From 6adb2902107c29ba8770e20a5e8a5a1dd0f750c0 Mon Sep 17 00:00:00 2001 From: reyna-abhyankar Date: Sat, 24 Aug 2024 23:43:58 -0700 Subject: [PATCH 01/91] temporary weight adjust index --- lib/local-execution/src/local_slots_backing.cc | 14 +++++++++++--- .../test/src/test_local_slots_backing.cc | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 0ec9068c6a..c8d186a0fe 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -76,13 +76,22 @@ GenericTensorAccessorW const & TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { TensorSlotsBacking mapping; + int num_inputs = 0; + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) { + num_inputs += 1; + } + } + for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotGradId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; std::vector tensor_guids; + int weight_adjusted_idx = 0; switch (tensor_spec.role) { - case TensorRole::INPUT: case TensorRole::WEIGHT: + weight_adjusted_idx = num_inputs; + case TensorRole::INPUT: assert(contains_key(this->input_tensor_slots, op_guid)); tensor_guids = this->input_tensor_slots.at(op_guid); break; @@ -96,10 +105,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( // "type_is_unformattable" error } - assert(tensor_guids.size() > tensor_spec.idx); IsGrad is_grad = slot_grad_id.is_grad; GenericTensorAccessorW tensor_backing = - this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad); + this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad); mapping.insert({slot_grad_id, tensor_backing}); } diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 542aa66087..e31e7cf2b4 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.bind(QUERY, input_tensor(0)); b.bind(KEY, input_tensor(1)); b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(3)); + b.bind(WEIGHTS, weight_tensor(0)); b.bind(OUTPUT, output_tensor(0)); b.bind_grad(QUERY, input_tensor(0)); From 61697c2a30338ae39fa10ef35899f519c8d2e514 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 01:45:52 -0700 Subject: [PATCH 02/91] Loss function --- lib/kernels/CMakeLists.txt | 1 + .../include/kernels/optimizer_kernels.h | 6 +- ...timizer_kernel.cu => optimizer_kernels.cu} | 1 + .../generic_task_impl_function.h | 33 +++++++ .../local-execution/local_slots_backing.h | 11 ++- .../local-execution/local_training_backing.h | 11 ++- .../include/local-execution}/loss_functions.h | 22 ++--- .../model_training_instance.struct.toml | 26 +++++ .../task_arg_spec.variant.toml | 18 ++++ .../task_impl_function.variant.toml | 5 + .../include/local-execution/task_invocation.h | 71 ++++++++++++++ .../include/local-execution/task_signature.h | 57 +++++++++++ .../task_signature.struct.toml | 29 ++++++ .../tensor_guid_slot_spec.struct.toml | 27 ++++++ .../tensor_guid_spec.struct.toml | 22 +++++ .../src/generic_task_impl_function.cc | 53 ++++++++++ 
.../src/local_cost_estimator.cc | 3 +- .../src/local_slots_backing.cc | 51 +++++++++- .../src/local_training_backing.cc | 50 +++++++++- .../src/loss_functions.cc | 96 ++++++++----------- lib/local-execution/src/ops/attention.cc | 2 +- .../local-execution => src}/ops/attention.h | 0 lib/local-execution/src/task_invocation.cc | 49 ++++++++++ lib/local-execution/src/task_signature.cc | 25 +++++ .../src/task_signature_impl.cc | 2 +- .../test/src/test_task_registry.cc | 1 - .../op-attrs/ops/loss_attrs.variant.toml | 22 +++++ .../op-attrs/ops/loss_function.enum.toml | 23 +++++ .../include/op-attrs/ops/loss_functions.h | 68 +------------ .../op-attrs/ops/other_loss_attrs.struct.toml | 18 ++++ ...arse_categorical_ce_loss_attrs.struct.toml | 14 +++ lib/op-attrs/src/loss_functions.cc | 25 ++--- 32 files changed, 671 insertions(+), 171 deletions(-) rename lib/kernels/src/cuda/{optimizer_kernel.cu => optimizer_kernels.cu} (99%) create mode 100644 lib/local-execution/include/local-execution/generic_task_impl_function.h rename lib/{runtime/src => local-execution/include/local-execution}/loss_functions.h (63%) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/include/local-execution/task_arg_spec.variant.toml create mode 100644 lib/local-execution/include/local-execution/task_invocation.h create mode 100644 lib/local-execution/include/local-execution/task_signature.h create mode 100644 lib/local-execution/include/local-execution/task_signature.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/src/generic_task_impl_function.cc rename lib/{runtime => local-execution}/src/loss_functions.cc (63%) rename lib/local-execution/{include/local-execution => src}/ops/attention.h (100%) create mode 100644 lib/local-execution/src/task_invocation.cc create mode 100644 lib/local-execution/src/task_signature.cc create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index 8ccd7c1011..54fa3c9583 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -8,6 +8,7 @@ file(GLOB_RECURSE SRC LIST_DIRECTORIES False src/*.cc src/cuda/cuda_helper.cu + src/cuda/loss_functions_kernels.cu src/cuda/ops/*.cu ) diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 9ca6bf8e2b..fcbf9454f8 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,7 +1,8 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "device.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" namespace FlexFlow { @@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t, float lr, float momentum, bool nesterov, - float weight_decay PerDeviceFFHandle const &, + float weight_decay, + PerDeviceFFHandle const &, float const *weight_grad_ptr, size_t size, float *weight_ptr, diff --git 
a/lib/kernels/src/cuda/optimizer_kernel.cu b/lib/kernels/src/cuda/optimizer_kernels.cu similarity index 99% rename from lib/kernels/src/cuda/optimizer_kernel.cu rename to lib/kernels/src/cuda/optimizer_kernels.cu index 439eed9dec..1bb38b2870 100644 --- a/lib/kernels/src/cuda/optimizer_kernel.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. */ +#include "device.h" #include "kernels/optimizer_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h new file mode 100644 index 0000000000..425740f61d --- /dev/null +++ b/lib/local-execution/include/local-execution/generic_task_impl_function.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct GenericTaskImplFunction { + + void (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(GenericTaskImplFunction const &) const; + bool operator!=(GenericTaskImplFunction const &) const; + bool operator<(GenericTaskImplFunction const &) const; + bool operator>(GenericTaskImplFunction const &) const; + bool operator<=(GenericTaskImplFunction const &) const; + bool operator>=(GenericTaskImplFunction const &) const; +}; + +std::string format_as(GenericTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::GenericTaskImplFunction> { + size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 6a0c28e988..312a13cc01 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -7,6 +7,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.h" namespace FlexFlow { @@ -19,23 +20,29 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); + void allocate_label_tensor(tensor_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; ConcreteArgSpec resolve_runtime_arg_ref_spec(RuntimeArgRefSpec const &) const; ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; + GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, + IsGrad) const; + private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; - GenericTensorAccessorW const 
&get_tensor_backing(tensor_guid_t const &,
-                                             IsGrad) const;
 
 public:
   // tensors
diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h
index b398bb8cc3..55983086c2 100644
--- a/lib/local-execution/include/local-execution/local_training_backing.h
+++ b/lib/local-execution/include/local-execution/local_training_backing.h
@@ -2,7 +2,9 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H
 
 #include "local-execution/local_slots_backing.h"
+#include "local-execution/model_training_instance.dtg.h"
 #include "local-execution/task_registry.h"
+#include "op-attrs/ops/loss_functions.h"
 
 namespace FlexFlow {
 
@@ -13,15 +15,17 @@ struct LocalTrainingBacking {
   LocalTrainingBacking(Allocator const &,
                        ComputationGraph const &,
                        TensorBackingMap const &,
-                       RuntimeArgConfig const &);
+                       RuntimeArgConfig const &,
+                       std::optional<ModelTrainingInstance> const &);
 
   void execute_init();
   PerLayerElapsedTime execute_forward();
   PerLayerElapsedTime execute_backward();
   void execute_update();
 
-  TaskArgumentAccessor get_task_arg_accessor(OpTaskInvocation const &,
-                                             layer_guid_t const &) const;
+  TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const;
+  TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &,
+                                                layer_guid_t const &) const;
 
 private:
   DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
@@ -33,6 +37,7 @@ struct LocalTrainingBacking {
   ComputationGraph computation_graph;
   TaskRegistry task_registry;
   LocalSlotsBacking local_slots_backing;
+  std::optional<ModelTrainingInstance> training_instance;
 };
 
 } // namespace FlexFlow
diff --git a/lib/runtime/src/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
similarity index 63%
rename from lib/runtime/src/loss_functions.h
rename to lib/local-execution/include/local-execution/loss_functions.h
index 620ebc6936..e5e81b60a7 100644
--- a/lib/runtime/src/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -13,24 +13,20 @@
  * limitations under the License.
*/ -#ifndef _FF_LOSS_FUNCTIONS_H_ -#define _FF_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_ +#define _FLEXFLOW_LOSS_FUNCTIONS_H_ +#include "local-execution/task_impl_function.dtg.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" -#include "pcg/operator.h" -#include "pcg/parallel_tensor.h" -#include "pcg/parallel_tensor_guid_t.h" -#include "task_spec/task_invocation.h" -#include "tasks.h" namespace FlexFlow { -template <> -void register_task(); - -TaskInvocation backward(LossAttrs const &, - parallel_tensor_guid_t logit, - parallel_tensor_guid_t label); +TaskImplFunction get_loss_bwd_task_impl(); +TaskSignature get_loss_bwd_signature(); +TaskInvocation + backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml new file mode 100644 index 0000000000..ea7e8d24ab --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "ModelTrainingInstance" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "utils/optional.h", + "op-attrs/ops/loss_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h", +] + +[[fields]] +name = "loss_attrs" +type = "::FlexFlow::LossAttrs" + +[[fields]] +name = "label_tensor" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "logit_tensor" +type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml new file mode 100644 index 0000000000..a6df0c8a7d --- /dev/null +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "TaskArgSpec" +features = [ + "eq" +] + +includes = [ + "local-execution/concrete_arg.h", + "local-execution/runtime_arg_ref.h" +] + +[[values]] +type = "::FlexFlow::ConcreteArgSpec" +key = "concrete_arg_spec" + +[[values]] +type = "::FlexFlow::RuntimeArgRefSpec" +key = "runtime_arg_ref" diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml index a12be37da2..1be18bebfa 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml @@ -10,6 +10,7 @@ features = [ includes = [ "local-execution/init_task_impl_function.h", "local-execution/fwd_bwd_task_impl_function.h", + "local-execution/generic_task_impl_function.h", ] [[values]] @@ -19,3 +20,7 @@ key = "init_task_impl_function" [[values]] type = "::FlexFlow::FwdBwdTaskImplFunction" key = "fwd_bwd_task_impl_function" + +[[values]] +type = "::FlexFlow::GenericTaskImplFunction" +key = "generic_task_impl_function" diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h new file mode 100644 index 0000000000..2317c65c02 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_invocation.h @@ -0,0 +1,71 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H + +#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_id_t.dtg.h" 
+#include "local-execution/task_arg_spec.dtg.h" +#include "local-execution/task_id_t.dtg.h" +#include "local-execution/task_signature.dtg.h" +#include "local-execution/tensor_guid_spec.dtg.h" + +namespace FlexFlow { + +struct TaskBinding { + TaskBinding() = default; + + void bind(int, TensorGuidSpec const &); + void bind(slot_id_t, TensorGuidSpec const &); + + template + void bind_arg(int name, T const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, T const &t) { + this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)}); + } + + template + void bind_arg(int name, RuntimeArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)}); + } + + bool operator==(TaskBinding const &other) const; + bool operator!=(TaskBinding const &other) const; + + std::unordered_map const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + +private: + std::unordered_map tensor_bindings; + std::unordered_map arg_bindings; + +private: + void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); + std::tuple + tie() const; +}; + +struct TaskInvocation { +public: + TaskInvocation() = delete; + TaskInvocation(task_id_t task_id, TaskBinding const &binding) + : task_id(task_id), binding(binding) {} + +public: + task_id_t task_id; + TaskBinding binding; +}; + +bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h new file mode 100644 index 0000000000..d31a67e027 --- /dev/null +++ b/lib/local-execution/include/local-execution/task_signature.h @@ -0,0 +1,57 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H + +// #include "local-execution/tensor_guid_slot_spec.dtg.h" +// #include "local-execution/serialization.h" +// #include "utils/hash/unordered_map.h" +// #include "utils/hash/unordered_set.h" +// #include "utils/type_index.h" + +#include "local-execution/task_signature.dtg.h" + +namespace FlexFlow { + +TaskSignature make_empty_task_signature(); + +void add_slot(TaskSignature &, + int name, + IsGrad, + SlotType slot_type = SlotType::TENSOR); +void add_slot(TaskSignature &, + slot_id_t name, + IsGrad, + SlotType slot_type = SlotType::TENSOR); + +template +void add_arg_slot(TaskSignature &task_signature, int name) { + add_arg_slot(task_signature, slot_id_t{name}); +} + +template +void add_arg_slot(TaskSignature &task_signature, slot_id_t name) { + // static_assert(is_serializable::value, "Type must be serializable"); + task_signature.task_arg_types.insert({name, get_type_index_for_type()}); +} + +template +void add_return_value(TaskSignature &task_signature) { + task_signature.return_value = get_type_index_for_type(); +} + +// adds arg_slot without checking is_serializable, used for arguments that are +// deviceSpecific +template +void add_unchecked_arg_slot(TaskSignature &task_signature, int name) { + add_unchecked_arg_slot(task_signature, slot_id_t{name}); +} + +// adds arg_slot without checking is_serializable, used for arguments that are +// deviceSpecific +template +void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) { + task_signature.task_arg_types.insert({name, get_type_index_for_type()}); +} + 
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml
new file mode 100644
index 0000000000..f86f7b0c57
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_signature.struct.toml
@@ -0,0 +1,29 @@
+namespace = "FlexFlow"
+name = "TaskSignature"
+features = [
+  "eq",
+  "fmt",
+]
+
+includes = [
+  "local-execution/tensor_guid_slot_spec.dtg.h",
+  "utils/type_index.h",
+  "utils/optional.h"
+]
+
+src_includes = [
+  "utils/fmt/unordered_map.h",
+  "utils/fmt/unordered_set.h",
+]
+
+[[fields]]
+name = "return_value"
+type = "std::optional<std::type_index>"
+
+[[fields]]
+name = "task_arg_types"
+type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>"
+
+[[fields]]
+name = "tensor_guid_slots"
+type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>"
diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
new file mode 100644
index 0000000000..4b3e5b2674
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
@@ -0,0 +1,27 @@
+namespace = "FlexFlow"
+name = "TensorGuidSlotSpec"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "ord",
+]
+
+includes = [
+  "local-execution/slot_id_t.dtg.h",
+  "local-execution/slot_type.dtg.h",
+  "local-execution/is_grad.dtg.h",
+]
+
+[[fields]]
+name = "name"
+type = "::FlexFlow::slot_id_t"
+
+[[fields]]
+name = "slot_type"
+type = "::FlexFlow::SlotType"
+
+[[fields]]
+name = "is_grad"
+type = "::FlexFlow::IsGrad"
+
diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
new file mode 100644
index 0000000000..a51d6ccf1b
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
@@ -0,0 +1,22 @@
+namespace = "FlexFlow"
+name = "TensorGuidSpec"
+features = [
+  "eq",
+  "fmt",
+  "hash",
+  "ord"
+]
+
+includes = [
+  "pcg/tensor_guid_t.dtg.h",
+  "local-execution/is_grad.dtg.h",
+]
+
+[[fields]]
+name = "tensor_guid"
+type = "::FlexFlow::tensor_guid_t"
+
+[[fields]]
+name = "is_grad"
+type = "::FlexFlow::IsGrad"
+
diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/local-execution/src/generic_task_impl_function.cc
new file mode 100644
index 0000000000..87d4db53e6
--- /dev/null
+++ b/lib/local-execution/src/generic_task_impl_function.cc
@@ -0,0 +1,53 @@
+#include "local-execution/generic_task_impl_function.h"
+
+namespace FlexFlow {
+
+bool GenericTaskImplFunction::operator==(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr == other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator!=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr != other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator<(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr < other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator>(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr > other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator<=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr <= other.function_ptr;
+}
+
+bool GenericTaskImplFunction::operator>=(
+    GenericTaskImplFunction const &other) const {
+  return this->function_ptr >= other.function_ptr;
+}
+
+std::string format_as(GenericTaskImplFunction const &x) {
+  std::ostringstream oss;
+  oss << "<GenericTaskImplFunction function_ptr=" << x.function_ptr << ">";
+  return oss.str();
+}
+std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) {
+  return s << fmt::to_string(x);
+}
+
+} // namespace FlexFlow
+
+namespace std {
+size_t hash<::FlexFlow::GenericTaskImplFunction>::operator()(
+    ::FlexFlow::GenericTaskImplFunction const &x) const {
+  return std::hash<decltype(x.function_ptr)>{}(x.function_ptr);
+}
+} // namespace std
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index d4e0467cbf..1ca422d8e1 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -75,7 +75,8 @@ CostDetails LocalCostEstimator::estimate_cost(
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config);
+                                     this->runtime_arg_config,
+                                     std::nullopt);
 
   local_backing.execute_init();
   PerLayerElapsedTime fwd = local_backing.execute_forward();
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index c8d186a0fe..967f8d9ba3 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -15,6 +15,14 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
+void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor,
+                                              ComputationGraph const &cg,
+                                              Allocator &allocator) {
+  GenericTensorAccessorW tensor_backing =
+      allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape);
+  this->tensor_mapping.insert({label_tensor, tensor_backing});
+}
+
 void LocalSlotsBacking::allocate_outgoing_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
@@ -78,7 +86,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
   TensorSlotsBacking mapping;
   int num_inputs = 0;
   for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    if (tensor_binding.first.is_grad == IsGrad::NO && tensor_binding.second.role == TensorRole::INPUT) {
+    if (tensor_binding.first.is_grad == IsGrad::NO &&
+        tensor_binding.second.role == TensorRole::INPUT) {
       num_inputs += 1;
     }
   }
@@ -90,7 +99,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     int weight_adjusted_idx = 0;
     switch (tensor_spec.role) {
       case TensorRole::WEIGHT:
-      weight_adjusted_idx = num_inputs;
+        weight_adjusted_idx = num_inputs;
       case TensorRole::INPUT:
         assert(contains_key(this->input_tensor_slots, op_guid));
         tensor_guids = this->input_tensor_slots.at(op_guid);
@@ -106,14 +115,30 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
   return mapping;
 }
 
+TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
+    TaskBinding const &binding) const {
+  TensorSlotsBacking mapping;
+
+  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
+    SlotGradId slot_grad_id = tensor_binding.first;
+    TensorGuidSpec tensor_spec = tensor_binding.second;
+
+    GenericTensorAccessorW accessor =
+        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+
+    mapping.insert({slot_grad_id, accessor});
+  }
+
+  return mapping;
+}
+
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
   ArgSlotsBacking mapping;
@@ -135,6 +160,24 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
   return mapping;
 }
 
+ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
+    TaskBinding const &binding) const {
+  ArgSlotsBacking mapping;
+  for (auto const &arg_binding : binding.get_arg_bindings()) {
+    slot_id_t arg_slot = arg_binding.first;
+    TaskArgSpec task_arg_spec = arg_binding.second;
+
+    mapping.insert({arg_slot,
+                    task_arg_spec.visit<ConcreteArgSpec>(overload{
+                        [&](RuntimeArgRefSpec const &s) {
+                          return this->resolve_runtime_arg_ref_spec(s);
+                        },
+                        [](ConcreteArgSpec const &s) { return s; },
+                    })});
+  }
+  return mapping;
+}
+
 ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
     OpArgRefSpec const &op_arg_ref_spec, layer_guid_t const &op_guid) const {
   if (op_arg_ref_spec.holds<DeviceSpecificDeviceStates>()) {
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index a2ee06a95a..f54d0ddaad 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -1,4 +1,5 @@
 #include "local-execution/local_training_backing.h"
+#include "local-execution/loss_functions.h"
 #include "local-execution/task_signature_impl.h"
 #include "utils/containers/reversed.h"
 #include "utils/exception.h"
@@ -9,10 +10,12 @@ LocalTrainingBacking::LocalTrainingBacking(
     Allocator const &allocator,
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
-    RuntimeArgConfig const &runtime_arg_config)
+    RuntimeArgConfig const &runtime_arg_config,
+    std::optional<ModelTrainingInstance> const &training_instance)
     : allocator(allocator), computation_graph(computation_graph),
       local_slots_backing(tensor_backing_mapping, runtime_arg_config),
-      task_registry(empty_task_registry()) {
+      task_registry(empty_task_registry()),
+      training_instance(training_instance) {
 
   for (layer_guid_t const &node : topological_ordering(computation_graph)) {
     ComputationGraphOpAttrs attrs =
@@ -25,6 +28,13 @@ LocalTrainingBacking::LocalTrainingBacking(
     // register tasks
     register_tasks_for_layer(this->task_registry, node, attrs);
   }
+
+  if (this->training_instance.has_value()) {
+    this->local_slots_backing.allocate_label_tensor(
+        this->training_instance.value().label_tensor,
+        computation_graph,
+        this->allocator);
+  }
 }
 
 DeviceSpecificDeviceStates
@@ -56,7 +66,7 @@ void LocalTrainingBacking::execute_init() {
       OpTaskInvocation invocation = init(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       DeviceSpecificDeviceStates device_state =
           this->call_init_task_impl(invocation.task_id, accessor);
       this->local_slots_backing.add_per_device_op_state(operator_node,
@@ -67,6 +77,7 @@ void LocalTrainingBacking::execute_init() {
 
 PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
   PerLayerElapsedTime per_op_elapsed_time;
+
   for (layer_guid_t const &operator_node :
        topological_ordering(this->computation_graph)) {
     if (this->task_registry.forward_task_ids.at(operator_node).has_value()) {
@@ -75,17 +86,35 @@ PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
       OpTaskInvocation invocation = forward(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       std::optional<float> elapsed_time =
           this->call_task_impl(invocation.task_id, accessor);
       per_op_elapsed_time.insert({operator_node, elapsed_time});
     }
   }
+
   return per_op_elapsed_time;
 }
 
 PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
   PerLayerElapsedTime per_op_elapsed_time;
+
+  // compute loss
+  if (this->training_instance.has_value()) {
+    ModelTrainingInstance unwrapped_training_instance =
+        training_instance.value();
+    TaskInvocation loss_invocation =
+        backward(unwrapped_training_instance.loss_attrs,
+                 unwrapped_training_instance.logit_tensor,
+                 unwrapped_training_instance.label_tensor);
+    assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+    TaskArgumentAccessor loss_accessor =
+        this->get_task_arg_accessor(loss_invocation);
+    TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+    loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+  }
+
+  // backward through computation graph
   for (layer_guid_t const &operator_node :
        reversed(topological_ordering(this->computation_graph))) {
     if (this->task_registry.backward_task_ids.at(operator_node).has_value()) {
@@ -94,7 +123,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
       OpTaskInvocation invocation = backward(attrs);
 
       TaskArgumentAccessor accessor =
-          this->get_task_arg_accessor(invocation, operator_node);
+          this->get_op_task_arg_accessor(invocation, operator_node);
       std::optional<float> elapsed_time =
           this->call_task_impl(invocation.task_id, accessor);
       per_op_elapsed_time.insert({operator_node, elapsed_time});
@@ -108,6 +137,17 @@ void LocalTrainingBacking::execute_update() {
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+    TaskInvocation const &invocation) const {
+  TensorSlotsBacking tensor_slots_backing =
+      this->local_slots_backing.construct_tensor_slots_backing(
+          invocation.binding);
+  ArgSlotsBacking arg_slots_backing =
+      this->local_slots_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      this->allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor(
     OpTaskInvocation const &invocation, layer_guid_t const &op_guid) const {
   TensorSlotsBacking tensor_slots_backing =
       this->local_slots_backing.construct_tensor_slots_backing(
diff --git a/lib/runtime/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
similarity index 63%
rename from lib/runtime/src/loss_functions.cc
rename to lib/local-execution/src/loss_functions.cc
index b0d5ac2029..6b23d5da51 100644
--- a/lib/runtime/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -13,56 +13,44 @@
  * limitations under the License.
 */
 
-#include "loss_functions.h"
+#include "op-attrs/ops/loss_functions.h"
 #include "kernels/loss_function_kernels.h"
-#include "legion.h"
-#include "runtime/profiling.h"
-#include "task_spec/task_argument_accessor.h"
+#include "local-execution/loss_functions.h"
+#include "local-execution/profiling.h"
 
 namespace FlexFlow {
 
-enum LossSlots {
-  LOGIT_GRAD,
-  LOGIT,
-  LABEL,
-  LOSS_ATTRS,
-  BATCH_SIZE,
-  PROFILING_SETTINGS
-};
+enum Slots { LOGIT, LABEL, ATTRS, PROFILING };
 
-TaskInvocation backward_invocation(LossAttrs const &attrs,
-                                   EnableProfiling enable_profiling,
-                                   parallel_tensor_guid_t logit,
-                                   parallel_tensor_guid_t label) {
-  auto binding = IndexTaskBinding{LOGIT};
-  StandardTypedTaskArg<LossAttrs> arg = attrs;
-  binding.bind_arg(LOSS_ATTRS, attrs);
-  binding.bind(LOGIT, logit);
-  binding.bind(LABEL, label);
-  binding.bind(LOGIT_GRAD, grad(logit));
-  binding.bind_arg(PROFILING_SETTINGS, profiling_settings());
+TaskSignature get_loss_bwd_signature() {
+  TaskSignature sig = make_empty_task_signature();
+  add_slot(sig, LOGIT, IsGrad::NO);
+  add_slot(sig, LABEL, IsGrad::NO);
+  add_slot(sig, LOGIT, IsGrad::YES);
+  add_arg_slot<LossAttrs>(sig, ATTRS);
+  add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  return sig;
+}
+
+TaskInvocation
+    backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) {
+  TaskBinding b;
+  b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO});
+  b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO});
+  b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES});
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
 
-  /* if ((logit_domain != part_domain) || (label_domain != part_domain)) { */
   // TODO @lockshaw make sure this is still checked
-  /* fprintf(stderr, */
-  /*         "Encounter inconsistency in parallelizing loss computation"); */
-  /* assert(false); */
-  /* } */
-  return {LOSS_BWD_TASK_ID, binding};
+  return {task_id_t::LOSS_BWD_TASK_ID, b};
 }
 
-static void
-    loss_backward_task(Legion::Task const *task,
-                       std::vector<Legion::PhysicalRegion> const &regions,
-                       Legion::Context ctx,
-                       Legion::Runtime *runtime) {
-  TaskArgumentAccessor acc(task, regions, ctx, runtime);
-  auto attrs = acc.get_argument<LossAttrs>(LOSS_ATTRS);
-  auto profiling_settings =
-      acc.get_argument<ProfilingSettings>(PROFILING_SETTINGS);
-  auto batch_size = acc.get_argument<int>(BATCH_SIZE);
-  auto logit_grad = acc.get_tensor<Permissions::RW>(LOGIT_GRAD);
+static void backward_task_impl(TaskArgumentAccessor const &acc) {
+  auto attrs = acc.get_argument<LossAttrs>(ATTRS);
+  auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto logit_grad = acc.get_tensor_grad<Permissions::RW>(LOGIT);
   auto logit = acc.get_tensor<Permissions::RO>(LOGIT);
   auto label = acc.get_tensor<Permissions::RO>(LABEL);
+  int batch_size = label.shape.at(ff_dim_t{0});
 
   LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -73,7 +61,7 @@ static void
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
     // assertion the outter-most dim is replica dim and replica degree is 1
-    auto scce_attrs = get<SparseCategoricalCrossEntropyLossAttrs>(attrs);
+    auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
     assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
     int num_samples = logit.shape.at(legion_dim_t(ndim - 2));
     int num_classes = logit.shape.get_volume() / num_samples;
     assert(logit_grad.shape == logit.shape);
 
     int k = 1;
     if (scce_attrs.replace_labels) {
       k = logit.shape.at(legion_dim_t(ndim - 1)) /
           label.shape.at(legion_dim_t(
               ndim - 1)); // TODO FIXME something seems wrong here, isn't the
                           // numerator guaranteed to be 1?
} - assert(label.shape.sub_shape(legion_dim_t(1), nullopt) == - logit.shape.sub_shape(legion_dim_t(1), nullopt)); + assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); assert(k * label.shape.at(legion_dim_t(ndim - 1)) == logit.shape.at(legion_dim_t(ndim - 1))); assert(label.shape.at(legion_dim_t(0)) == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_int32_ptr(label), - logit.shape.get_volume(), + get_volume(logit.shape), get_volume(logit_grad.shape), num_samples, num_classes, @@ -115,7 +103,7 @@ static void switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, - profiling_settings, + profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -127,7 +115,7 @@ static void } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { profile(mean_squared_error_avg_loss_backward_kernel, - profiling_settings, + profiling, "[MeanSquaredErrorAvgLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -139,7 +127,7 @@ static void } case LossFunction::IDENTITY: { profile(identity_loss_backward_kernel, - profiling_settings, + profiling, "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), @@ -156,16 +144,8 @@ static void } } -template <> -void register_task() { - TaskSignature sig; - sig.add_arg_slot(LOSS_ATTRS); - sig.add_arg_slot(PROFILING_SETTINGS); - sig.add_slot(LOGIT, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LABEL, {SlotType::TENSOR, Permissions::RO}); - sig.add_slot(LOGIT_GRAD, {SlotType::TENSOR, Permissions::RW}); - - register_task(LOSS_BWD_TASK_ID, "Loss Backward", sig, loss_backward_task); +TaskImplFunction get_loss_bwd_task_impl() { + return TaskImplFunction{GenericTaskImplFunction{backward_task_impl}}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..8ede2cb38b 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. 
 */
 
-#include "local-execution/ops/attention.h"
+#include "attention.h"
 #include "kernels/attention_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/ops/attention.h"
diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/src/ops/attention.h
similarity index 100%
rename from lib/local-execution/include/local-execution/ops/attention.h
rename to lib/local-execution/src/ops/attention.h
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
new file mode 100644
index 0000000000..c64af5332e
--- /dev/null
+++ b/lib/local-execution/src/task_invocation.cc
@@ -0,0 +1,49 @@
+#include "local-execution/task_invocation.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
+  this->bind(slot_id_t{name}, tensor_guid_spec);
+}
+
+void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
+  this->tensor_bindings.insert(
+      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+}
+
+void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
+  assert(!contains_key(this->arg_bindings, name));
+  this->arg_bindings.insert({name, arg_spec});
+}
+
+bool TaskBinding::operator==(TaskBinding const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool TaskBinding::operator!=(TaskBinding const &other) const {
+  return this->tie() != other.tie();
+}
+
+std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+    TaskBinding::tie() const {
+  return std::tie(this->tensor_bindings, this->arg_bindings);
+}
+
+std::unordered_map<SlotGradId, TensorGuidSpec> const &
+    TaskBinding::get_tensor_bindings() const {
+  return this->tensor_bindings;
+}
+
+std::unordered_map<slot_id_t, TaskArgSpec> const &
+    TaskBinding::get_arg_bindings() const {
+  return this->arg_bindings;
+}
+
+bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
+  // TODO: implement signature checking
+  return true;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc
new file mode 100644
index 0000000000..3bba9e2c8a
--- /dev/null
+++ b/lib/local-execution/src/task_signature.cc
@@ -0,0 +1,25 @@
+#include "local-execution/task_signature.h"
+
+namespace FlexFlow {
+
+TaskSignature make_empty_task_signature() {
+  return TaskSignature(std::nullopt, {}, {});
+}
+
+void add_slot(TaskSignature &task_signature,
+              int name,
+              IsGrad is_grad,
+              SlotType slot_type) {
+  add_slot(task_signature, slot_id_t{name}, is_grad, slot_type);
+}
+
+void add_slot(TaskSignature &task_signature,
+              slot_id_t name,
+              IsGrad is_grad,
+              SlotType slot_type) {
+  TensorGuidSlotSpec tensor_guid_slot_spec =
+      TensorGuidSlotSpec{name, slot_type, is_grad};
+  task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec);
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc
index ca428aad25..16b7870601 100644
--- a/lib/local-execution/src/task_signature_impl.cc
+++ b/lib/local-execution/src/task_signature_impl.cc
@@ -1,5 +1,5 @@
 #include "local-execution/task_signature_impl.h"
-#include "local-execution/ops/attention.h"
+#include "ops/attention.h"
 #include "ops/batch_matmul.h"
 #include "ops/batch_norm.h"
 #include "ops/cast.h"
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index fa3b068425..2c3a6c1d63 100644
--- 
a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,7 +1,6 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/ops/attention.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph_builder.h" #include "utils/fmt/optional.h" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml new file mode 100644 index 0000000000..8a4f38839c --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml @@ -0,0 +1,22 @@ +namespace = "FlexFlow" +name = "LossAttrs" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", +] + +includes = [ + "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h", + "op-attrs/ops/other_loss_attrs.dtg.h" +] + +[[values]] +type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs" +key = "sparse_categorical_ce_loss_attrs" + +[[values]] +type = "::FlexFlow::OtherLossAttrs" +key = "other_loss_attrs" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml new file mode 100644 index 0000000000..b9cd13eabf --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LossFunction" +features = [ + "hash", + "json", + "rapidcheck", + "fmt", +] + +[[values]] +name = "CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "SPARSE_CATEGORICAL_CROSSENTROPY" + +[[values]] +name = "MEAN_SQUARED_ERROR_AVG_REDUCE" + +[[values]] +name = "MEAN_SQUARED_ERROR_SUM_REDUCE" + +[[values]] +name = "IDENTITY" diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h index 58d372d9e5..9fb0597197 100644 --- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h +++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h @@ -2,74 +2,16 @@ #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H #include "core.h" -#include "utils/exception.h" -#include "utils/visitable.h" -#include +#include "loss_attrs.dtg.h" +#include "loss_function.dtg.h" +#include "other_loss_attrs.dtg.h" +#include "sparse_categorical_ce_loss_attrs.dtg.h" namespace FlexFlow { -enum class LossFunction { - CATEGORICAL_CROSSENTROPY, - SPARSE_CATEGORICAL_CROSSENTROPY, - MEAN_SQUARED_ERROR_AVG_REDUCE, - MEAN_SQUARED_ERROR_SUM_REDUCE, - IDENTITY -}; - -LossFunction parse_loss_function_name(std::string const &); - -struct SparseCategoricalCrossEntropyLossAttrs { - req replace_labels; // for aggregate_spec: More predictions than labels -}; -FF_VISITABLE_STRUCT(SparseCategoricalCrossEntropyLossAttrs, replace_labels); -CHECK_VALID_OP_ATTR(SparseCategoricalCrossEntropyLossAttrs); - -struct OtherLossAttrs { - req loss_type; -}; -FF_VISITABLE_STRUCT(OtherLossAttrs, loss_type); -CHECK_VALID_OP_ATTR(OtherLossAttrs); - -using LossAttrs = - std::variant; - -LossFunction get_loss_function(OtherLossAttrs const &); -LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &); LossFunction get_loss_function(LossAttrs const &); +LossFunction parse_loss_name(std::string const &raw_name); } // namespace FlexFlow -namespace fmt { - -template <> -struct formatter<::FlexFlow::LossFunction> : formatter { - template - auto format(::FlexFlow::LossFunction d, FormatContext &ctx) const - -> decltype(ctx.out()) { - using namespace FlexFlow; - - string_view name = "unknown"; - 
switch (d) {
-      case LossFunction::CATEGORICAL_CROSSENTROPY:
-        name = "CategoricalCrossEntropy";
-        break;
-      case LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY:
-        name = "SparseCategoricalCrossEntropy";
-        break;
-      case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE:
-        name = "MeanSquaredErrorAvgReduce";
-        break;
-      case LossFunction::MEAN_SQUARED_ERROR_SUM_REDUCE:
-        name = "MeanSquaredErrorSumReduce";
-        break;
-      case LossFunction::IDENTITY:
-        name = "Identity";
-        break;
-    }
-    return formatter<string_view>::format(name, ctx);
-  }
-};
-
-} // namespace fmt
-
 #endif
diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
new file mode 100644
index 0000000000..81055f5835
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
@@ -0,0 +1,18 @@
+namespace = "FlexFlow"
+name = "OtherLossAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+includes = [
+  "op-attrs/ops/loss_function.dtg.h"
+]
+
+[[fields]]
+name = "loss_type"
+type = "::FlexFlow::LossFunction"
diff --git a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
new file mode 100644
index 0000000000..21378a1154
--- /dev/null
+++ b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
@@ -0,0 +1,14 @@
+namespace = "FlexFlow"
+name = "SparseCategoricalCrossEntropyLossAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "rapidcheck",
+  "fmt",
+]
+
+[[fields]]
+name = "replace_labels"
+type = "bool"
diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc
index 094e117d77..cae88be453 100644
--- a/lib/op-attrs/src/loss_functions.cc
+++ b/lib/op-attrs/src/loss_functions.cc
@@ -1,27 +1,18 @@
 #include "op-attrs/ops/loss_functions.h"
 #include "utils/containers/transform.h"
+#include "utils/exception.h"
+#include "utils/overload.h"
 #include 
 #include 
 
 namespace FlexFlow {
 
-LossFunction get_loss_type(OtherLossAttrs const &attrs) {
-  return attrs.loss_type;
-}
-LossFunction
-    get_loss_type(SparseCategoricalCrossEntropyLossAttrs const &attrs) {
-  return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
-}
-
-struct GetLossFunction {
-  template <typename T>
-  LossFunction operator()(T const &t) {
-    return get_loss_type(t);
-  }
-};
-
-LossFunction get_loss_type(LossAttrs const &attrs) {
-  return visit(GetLossFunction{}, attrs);
+LossFunction get_loss_function(LossAttrs const &attrs) {
+  return attrs.visit<LossFunction>(
+      overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) {
+                 return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
+               },
+               [&](OtherLossAttrs const &s) { return s.loss_type; }});
 }
 
 LossFunction parse_loss_name(std::string const &raw_name) {

From b56c046b3bc44586bae96b59476b6c384f922837 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar 
Date: Tue, 27 Aug 2024 04:58:33 -0700
Subject: [PATCH 03/91] Add cuda test for loss function

---
 lib/kernels/CMakeLists.txt                    |  2 +-
 lib/kernels/include/kernels/array_shape.h     | 10 ++-
 lib/kernels/src/array_shape.cc                | 36 +++++++-
 lib/kernels/src/cuda/cuda_helper.cu           |  6 ++
 lib/kernels/src/device.h                      |  1 +
 .../local-execution/local_slots_backing.h     |  4 -
 .../src/local_slots_backing.cc                |  8 --
 .../src/local_training_backing.cc             | 11 ++-
 lib/local-execution/src/loss_functions.cc     | 22 ++---
 lib/local-execution/src/ops/element_unary.cc  |  8 +-
 .../src/task_signature_impl.cc                |  4 +-
 .../test/src/test_loss_function.cc            | 88 +++++++++++++++++++
 12 files changed, 159 insertions(+), 41 deletions(-)
 create mode 100644 lib/local-execution/test/src/test_loss_function.cc

diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
index 54fa3c9583..baac58f8e3 100644
--- a/lib/kernels/CMakeLists.txt
+++ b/lib/kernels/CMakeLists.txt
@@ -8,7 +8,7 @@ file(GLOB_RECURSE SRC
       LIST_DIRECTORIES False
       src/*.cc
       src/cuda/cuda_helper.cu
-      src/cuda/loss_functions_kernels.cu
+      src/cuda/loss_function_kernels.cu
       src/cuda/ops/*.cu
     )
 
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 5de9fae7ad..c95c447574 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -42,9 +42,13 @@ struct ArrayShape {
   std::optional<std::size_t> at_maybe(legion_dim_t) const;
   std::optional<std::size_t> at_maybe(ff_dim_t) const;
 
-  ArrayShape
-      sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-                std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
+  ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;
+
+  ArrayShape sub_shape(std::optional<ff_dim_t> start,
+                       std::optional<ff_dim_t> end) const;
+
+  ArrayShape sub_shape(std::optional<legion_dim_t> start,
+                       std::optional<legion_dim_t> end) const;
 
 public:
   LegionTensorDims dims;
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index d5e2f1167d..bf80c6b5c1 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -50,12 +50,42 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
   return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
 }
 
-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }
+
+ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
   NOT_IMPLEMENTED();
 }
 
+ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
+                                 std::optional<ff_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
+  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = ff_dim_t{start_idx.value + 1};
+  }
+  return ArrayShape{new_shape};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::vector<std::size_t> new_shape;
+  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
+  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
+
+  while (start_idx < end_idx) {
+    new_shape.push_back(this->at(start_idx));
+    start_idx = add_to_legion_dim(start_idx, 1);
+  }
+  return ArrayShape{new_shape};
+}
+
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
     return dims.at(index);
diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu
index 2ff02038f4..5a303ca15e 100644
--- a/lib/kernels/src/cuda/cuda_helper.cu
+++ b/lib/kernels/src/cuda/cuda_helper.cu
@@ -35,6 +35,12 @@ __global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
   }
 }
 
+__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
+  CUDA_KERNEL_LOOP(i, size) {
+    ptr[i] = (b - a) * ptr[i] + a;
+  }
+}
+
 __global__ void ones_kernel(float *ptr, coord_t size) {
   CUDA_KERNEL_LOOP(i, size) {
     ptr[i] = 1.0f;
diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index ceff2f92ff..e32805fde3 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -71,6 +71,7 @@ inline int GET_BLOCKS(int const N) {
 }
 
 __global__ void scale_kernel(float *ptr, size_t size, float a, float b);
+__global__ void 
scale_kernel(float *ptr, unsigned long size, float a, float b); __global__ void ones_kernel(float *ptr, size_t size); diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 312a13cc01..1f35bdd304 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -20,9 +20,6 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_label_tensor(tensor_guid_t const &, - ComputationGraph const &, - Allocator &); void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -40,7 +37,6 @@ struct LocalSlotsBacking { GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &, IsGrad) const; -private: bool is_tensor_allocated(tensor_guid_t const &) const; bool is_gradient_tensor_allocated(tensor_guid_t const &) const; diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 967f8d9ba3..787c7dda86 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -15,14 +15,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::allocate_label_tensor(tensor_guid_t const &label_tensor, - ComputationGraph const &cg, - Allocator &allocator) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(get_tensor_attrs(cg, label_tensor).shape); - this->tensor_mapping.insert({label_tensor, tensor_backing}); -} - void LocalSlotsBacking::allocate_outgoing_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index f54d0ddaad..98bfe7683e 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,6 +1,8 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/task_signature_impl.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" #include "utils/containers/reversed.h" #include "utils/exception.h" @@ -30,10 +32,11 @@ LocalTrainingBacking::LocalTrainingBacking( } if (this->training_instance.has_value()) { - this->local_slots_backing.allocate_label_tensor( - this->training_instance.value().label_tensor, - computation_graph, - this->allocator); + // label and logit tensor should be allocated + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().label_tensor)); + assert(this->local_slots_backing.is_tensor_allocated( + this->training_instance.value().logit_tensor)); } } diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 6b23d5da51..771d175a7d 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -50,7 +50,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_tensor(LABEL); - int batch_size = label.shape.at(ff_dim_t{0}); + int batch_size = logit.shape.at(legion_dim_t{1}); + // assuming logit shape is [parallel dim(?), batch dim, num classes] 
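+  // worked example under that assumption: for an ff-ordered logit shape
+  // [replica, batch, classes], legion dims index from the innermost
+  // dimension, so legion_dim_t{0} is the class dim and legion_dim_t{1}
+  // is the batch dim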
 LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -60,19 +61,18 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   }
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
-    // assertion the outter-most dim is replica dim and replica degree is 1
+    // label shape is [parallel dim(?), batch dim, 1]
     auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
-    assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
-    int num_samples = logit.shape.at(legion_dim_t(ndim - 2));
-    int num_classes = logit.shape.get_volume() / num_samples;
+    int num_classes = logit.shape.at(legion_dim_t{0});
     assert(logit_grad.shape == logit.shape);
 
     int k = 1;
     if (scce_attrs.replace_labels) {
       k = logit.shape.at(legion_dim_t(ndim - 1)) /
           label.shape.at(legion_dim_t(
-              ndim - 1)); // TODO FIXME something seems wrong here, isn't the
-                          // numerator guaranteed to be 1?
+              ndim - 1)); // TODO FIXME something seems wrong here, isn't the
+                          // numerator guaranteed to be 1? <--- this is not the
+                          // case because of the potential parallel dim
     }
     assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) ==
            logit.shape.sub_shape(legion_dim_t(1), std::nullopt));
     assert(k * label.shape.at(legion_dim_t(ndim - 1)) ==
            logit.shape.at(legion_dim_t(ndim - 1)));
     assert(label.shape.at(legion_dim_t(0)) == 1);
 
     profile(sparse_categorical_crossentropy_loss_backward_kernel,
             profiling,
             "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n",
             get_float_ptr(logit_grad),
             get_float_ptr(logit),
-            get_int32_ptr(label),
+            reinterpret_cast<int const *>(get_float_ptr(label)),
             get_volume(logit.shape),
             get_volume(logit_grad.shape),
-            num_samples,
+            batch_size,
             num_classes,
             k,
             scale_factor);
   } else {
     assert(logit.shape == label.shape);
     assert(logit_grad.shape == logit.shape);
-    // assertion the outter-most dim is replica dim and replica degree is 1
-    size_t ndim = logit.shape.num_dims();
-    assert(logit.shape.at(legion_dim_t(ndim - 1)) == 1);
-    int num_samples = label.shape.at(legion_dim_t(ndim - 1));
-    int num_channels = logit.shape.get_volume() / num_samples;
+    int num_channels = logit.shape.at(legion_dim_t{0});
     switch (loss_type) {
       case LossFunction::CATEGORICAL_CROSSENTROPY: {
         profile(categorical_crossentropy_loss_backward_kernel,
                 profiling,
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc
index a52ebb8089..502afb5f9f 100644
--- a/lib/local-execution/src/ops/element_unary.cc
+++ b/lib/local-execution/src/ops/element_unary.cc
@@ -34,7 +34,9 @@ OpTaskInvocation forward(ElementUnaryAttrs const &attrs) {
 
   b.bind(INPUT, input_tensor(0));
   b.bind(OUTPUT, output_tensor(0));
+
   b.bind_arg(ATTRS, attrs);
+  b.bind_arg(HANDLE, ff_handle());
   b.bind_arg(PROFILING, profiling_settings());
   b.bind_arg(PER_DEVICE_STATE, per_device_op_state());
 
@@ -51,8 +53,8 @@ OpTaskInvocation backward(ElementUnaryAttrs const &attrs) {
 
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+
   ParallelTensorShape input_shape =
       acc.get_argument<ParallelTensorShape>(INPUT_SHAPE);
 
@@ -68,7 +70,7 @@ static DeviceSpecificDeviceStates
 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
   auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
-  auto const &attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
+  auto attrs = acc.get_argument<ElementUnaryAttrs>(ATTRS);
 
   auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
 
diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc
index 16b7870601..3072b9a8bd 100644
--- a/lib/local-execution/src/task_signature_impl.cc
+++ b/lib/local-execution/src/task_signature_impl.cc
b/lib/local-execution/src/task_signature_impl.cc @@ -50,8 +50,8 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { return TaskSignatureAndImpl{get_element_unary_fwd_task_impl(), get_element_unary_fwd_signature()}; case task_id_t::ELEMENTUNARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_bwd_task_impl(), - get_element_binary_bwd_signature()}; + return TaskSignatureAndImpl{get_element_unary_bwd_task_impl(), + get_element_unary_bwd_signature()}; case task_id_t::CONV2D_INIT_TASK_ID: return TaskSignatureAndImpl{get_conv_2d_init_task_impl(), get_conv_2d_init_signature()}; diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc new file mode 100644 index 0000000000..73ab02646e --- /dev/null +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -0,0 +1,88 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/managed_ff_stream.h" +#include "pcg/computation_graph_builder.h" +#include "test_utils.h" +#include "local-execution/local_training_backing.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Loss Function Local Execution") { + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0} + }; + + // construct graph + ComputationGraphBuilder cg_builder; + + size_t batch_size = 10; + size_t data_dim = 100; + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES); + + float scalar = 4.0; + tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar); + + // allocate memory + Allocator allocator = create_local_cuda_memory_allocator(); + TensorBackingMap tensor_backing_map; + GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({input_tensor, input_backing}); + + SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + TensorShape label_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; + tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("OtherAttrs") { + tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, 
cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + SUBCASE("LossFunction::IDENTITY") { + ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, logit_tensor}; + LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + } + + } + } +} + +} // namespace FlexFlow From f75a3d4c1cc85ae60d6254ddcabbb40b6f2338ad Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 05:00:16 -0700 Subject: [PATCH 04/91] Format --- lib/kernels/src/array_shape.cc | 6 -- .../test/src/test_loss_function.cc | 89 +++++++++++++------ 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index bf80c6b5c1..69f04d6d34 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -50,12 +50,6 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } -// ArrayShape ArrayShape::sub_shape( -// std::optional> start, -// std::optional> end) const { -// NOT_IMPLEMENTED(); -// } - ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { NOT_IMPLEMENTED(); } diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc index 73ab02646e..9e60c1b979 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -1,10 +1,10 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" -#include "kernels/managed_per_device_ff_handle.h" #include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_training_backing.h" #include "pcg/computation_graph_builder.h" #include "test_utils.h" -#include "local-execution/local_training_backing.h" namespace FlexFlow { @@ -14,73 +14,106 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedPerDeviceFFHandle managed_handle{}; RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::NO, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0} - }; + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; // construct graph ComputationGraphBuilder cg_builder; size_t batch_size = 10; size_t data_dim = 100; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = cg_builder.create_tensor(input_shape, CreateGrad::YES); + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, 
DataType::FLOAT}; + tensor_guid_t input_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::YES); float scalar = 4.0; - tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar); + tensor_guid_t logit_tensor = + cg_builder.scalar_multiply(input_tensor, scalar); // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = allocator.allocate_tensor(input_shape); + GenericTensorAccessorW input_backing = + allocator.allocate_tensor(input_shape); tensor_backing_map.insert({input_tensor, input_backing}); SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { - TensorShape label_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - tensor_guid_t label_tensor = cg_builder.create_tensor(label_shape, CreateGrad::NO); - GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); + TensorShape label_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; + tensor_guid_t label_tensor = + cg_builder.create_tensor(label_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(label_shape); tensor_backing_map.insert({label_tensor, label_backing}); - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{ + SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("OtherAttrs") { - tensor_guid_t label_tensor = cg_builder.create_tensor(input_shape, CreateGrad::NO); - GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); + tensor_guid_t label_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(input_shape); tensor_backing_map.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, 
cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{ + OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } SUBCASE("LossFunction::IDENTITY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, - label_tensor, logit_tensor}; - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, model_training_instance); + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, + logit_tensor}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); } - } } } From f74711fb71685ef95c10770646e39fdf3acd27a0 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 12:30:16 -0700 Subject: [PATCH 05/91] Refactor and build optimizer kernels, op --- lib/kernels/CMakeLists.txt | 1 + lib/kernels/include/kernels/array_shape.h | 3 + .../include/kernels/optimizer_kernels.h | 3 + lib/kernels/src/array_shape.cc | 8 + lib/kernels/src/cuda/optimizer_kernels.cu | 167 +++++++------- .../include/local-execution/loss_functions.h | 4 +- .../include/local-execution/optimizer.h | 22 ++ .../src/local_training_backing.cc | 9 +- lib/local-execution/src/optimizer.cc | 205 ++++++++++++++++++ lib/pcg/include/pcg/optimizer_attrs.h | 14 -- .../include/pcg/optimizer_attrs.variant.toml | 23 ++ .../adam_optimizer_attrs.struct.toml | 4 + 12 files changed, 370 insertions(+), 93 deletions(-) create mode 100644 lib/local-execution/include/local-execution/optimizer.h create mode 100644 lib/local-execution/src/optimizer.cc delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/include/pcg/optimizer_attrs.variant.toml diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index baac58f8e3..5a6a0d1357 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -9,6 +9,7 @@ file(GLOB_RECURSE SRC src/*.cc src/cuda/cuda_helper.cu src/cuda/loss_function_kernels.cu + src/cuda/optimizer_kernels.cu src/cuda/ops/*.cu ) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index c95c447574..6b0b57b57f 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -50,6 +50,9 @@ struct ArrayShape { ArrayShape sub_shape(std::optional start, std::optional end) const; + bool operator==(ArrayShape const &) const; + bool operator!=(ArrayShape const &) const; + public: LegionTensorDims dims; }; diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index fcbf9454f8..ed7c2778dd 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -34,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, + size_t size, + int num_replicas, 
float const *weight_grad_ptr, float *adam_m_ptr, float *adam_v_ptr, @@ -45,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, + size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, float *adam_m_ptr, diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 69f04d6d34..ddfa3964e3 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -101,6 +101,14 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } +bool ArrayShape::operator==(ArrayShape const & other) const { + return this->dims == other.dims; +} + +bool ArrayShape::operator!=(ArrayShape const & other) const { + return this->dims != other.dims; +} + std::string format_as(ArrayShape const &x) { std::ostringstream oss; oss << " <<>>( - (float *)w_grad_ptr, src, size, 1.0f); + (float *)weight_grad_ptr, src, size, 1.0f); } // checkCUDA(cudaDeviceSynchronize()); // Step 2: SGD update sgd_update<<>>( size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #ifdef FF_USE_NCCL -__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr) { +void sgd_nccl_update_task_gpu(cudaStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const & handle, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *sgd_v_ptr) { // Use NCCL to sync gradients // fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr); - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, + checkNCCL(ncclAllReduce(weight_grad_ptr, + (float *)weight_grad_ptr, size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, + ncclDataType_t::ncclFloat, + ncclRedOp_t::ncclSum, + handle.ncclComm, stream)); // fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr); // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); @@ -94,13 +101,13 @@ __host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op, // Step 2: SGD update sgd_update<<>>( size, - op->lr, - op->weight_decay, - op->momentum, - op->nesterov, - w_grad_ptr, - v_ptr, - w_ptr); + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif @@ -145,20 +152,24 @@ __global__ void adam_update(int count, } } -__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, - float const *w_grad_ptr, - size_t size, - int num_replicas, - float *w_ptr, - float *v_ptr, - float *m_ptr) { - cudaStream_t stream; +void adam_ps_update_task_gpu(cudaStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + size_t size, + int num_replicas, + float const *weight_grad_ptr, + float *adam_m_ptr, + float *adam_v_ptr, + float *weight_ptr) { checkCUDA(get_legion_stream(&stream)); // Step 1: Gather gradients in the first replica for (int i = 1; i < num_replicas; i++) { - float const *src = w_grad_ptr + i * size; + float const *src = weight_grad_ptr + i * size; add_kernel<<>>( - size, 1.0f, src, (float *)w_grad_ptr); + size, 1.0f, src, (float *)weight_grad_ptr); } // checkCUDA(cudaDeviceSynchronize()); // 
fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", @@ -166,50 +177,54 @@ __host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op, // Step 2: Adam update adam_update<<>>( size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + weight_grad_ptr, + adam_m_ptr, + adam_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #ifdef FF_USE_NCCL -__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op, - PerDeviceOpState const *meta, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { +void adam_nccl_update_task_gpu(cudaStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + size_t size, + PerDeviceFFHandle const & handle, + float const *weight_grad_ptr, + float *adam_m_ptr, + float *adam_v_ptr, + float *weight_ptr) { // Use NCCL to sync gradients - cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); - checkNCCL(ncclAllReduce(w_grad_ptr, - (float *)w_grad_ptr, + checkNCCL(ncclAllReduce(weight_grad_ptr, + (float *)weight_grad_ptr, size, - ncclFloat, - ncclSum, - meta->handle.ncclComm, + ncclDataType_t::ncclFloat, + ncclRedOp_t::ncclSum, + handle.ncclComm, stream)); // fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n", // op->alpha, op->alpha_t, op->weight_decay); // Step 2: Adam update adam_update<<>>( size, - op->alpha_t, - op->beta1, - op->beta2, - op->weight_decay, - op->epsilon, - w_grad_ptr, - m_ptr, - v_ptr, - w_ptr); + alpha_t, + beta1, + beta2, + weight_decay, + epsilon, + weight_grad_ptr, + adam_m_ptr, + adam_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index e5e81b60a7..58405536d8 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -13,8 +13,8 @@ * limitations under the License. 
*/ -#ifndef _FLEXFLOW_LOSS_FUNCTIONS_H_ -#define _FLEXFLOW_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.h" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h new file mode 100644 index 0000000000..4702352568 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ +#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ + +#include "local-execution/task_impl_function.dtg.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature.h" +#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "pcg/optimizers/adam_optimizer_attrs.dtg.h" + +namespace FlexFlow { + +TaskSignature get_sgd_update_signature(); +TaskInvocation sgd_update(SGDOptimizerAttrs const &); +TaskImplFunction get_sgd_update_task_impl(); + +TaskSignature get_adam_update_signature(); +TaskInvocation adam_update(SGDOptimizerAttrs const &); +TaskImplFunction get_adam_update_task_impl(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 98bfe7683e..c8f5f279d2 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -4,6 +4,7 @@ #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/reversed.h" +#include "utils/containers/get_only.h" #include "utils/exception.h" namespace FlexFlow { @@ -136,7 +137,13 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { } void LocalTrainingBacking::execute_update() { - NOT_IMPLEMENTED(); + for (layer_guid_t const &node: topological_ordering(this->computation_graph)) { + LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); + if (layer_attrs.attrs.has()) { + tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); + // TODO: handle momentum vectors separately? handle different updates? 
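+      // a sketch of the intended dispatch, using helper names that land in
+      // a later patch of this series:
+      //   TaskInvocation invocation =
+      //       get_update_invocation(attrs, weight_tensor, buffer_tensors);
+      //   TaskArgumentAccessor accessor =
+      //       this->get_task_arg_accessor(invocation);
+      //   get_update_task_impl(attrs).get().function_ptr(accessor);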
+ } + } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc new file mode 100644 index 0000000000..2f45802978 --- /dev/null +++ b/lib/local-execution/src/optimizer.cc @@ -0,0 +1,205 @@ +#include "kernels/optimizer_kernels.h" +#include "local-execution/optimizer.h" +#include "local-execution/profiling.h" + +namespace FlexFlow { + +enum Slots { + ATTRS, + WEIGHT, + SGD_V, + PROFILING, + ADAM_M, + ADAM_V, + HANDLE +}; + +TaskSignature get_sgd_update_signature() { + TaskSignature sig = make_empty_task_signature(); + add_slot(sig, WEIGHT, IsGrad::YES); + add_slot(sig, WEIGHT, IsGrad::NO); + add_slot(sig, SGD_V, IsGrad::YES); + add_arg_slot(sig, ATTRS); + add_arg_slot(sig, PROFILING); + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + add_unchecked_arg_slot(sig, HANDLE); + } + return sig; +} + +TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, + tensor_guid_t const & weight, + tensor_guid_t const & sgd_v) { + TaskBinding b; + b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); + if (attrs.momentum > 0.0f) { + b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES}); + } + b.bind_arg(ATTRS, attrs); + b.bind_arg(PROFILING, profiling_settings()); + + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + b.bind_arg(HANDLE, ff_handle()); + return {task_id_t::SGD_UPD_NCCL_TASK_ID, b}; + } + return {task_id_t::SGD_UPD_PS_TASK_ID, b}; +} + +static void sgd_update_task_impl(TaskArgumentAccessor const & acc) { + auto attrs = acc.get_argument(ATTRS); + auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight = acc.get_tensor(WEIGHT); + auto profiling = acc.get_argument(PROFILING); + + assert (weight.shape == weight_grad.shape); + size_t size = weight_grad.shape.get_volume(); + + assert (weight_grad.shape.get_volume() & weight.shape.get_volume() == 0); + size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume(); + + float *sgd_v_ptr; + if (attrs.momentum > 0.0f) { + auto sgd_v = acc.get_tensor(SGD_V); + assert (sgd_v.shape == weight.shape); + sgd_v_ptr = sgd_v.get_float_ptr(); + } + + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + auto handle = acc.get_argument(HANDLE); + profile(sgd_nccl_update_task_gpu, + profiling, + "[SGD NCCL] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + handle, + weight_grad.get_float_ptr(), + size, + weight.get_float_ptr(), + sgd_v_ptr); + + } else { + profile(sgd_ps_update_task_gpu, + profiling, + "[SGD PS] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + weight_grad.get_float_ptr(), + size, + num_replicas, + weight.get_float_ptr(), + sgd_v_ptr); + } +} + +TaskImplFunction get_sgd_update_task_impl() { + return TaskImplFunction{GenericTaskImplFunction{sgd_update_task_impl}}; +} + +TaskSignature get_adam_update_signature() { + TaskSignature sig = make_empty_task_signature(); + add_slot(sig, WEIGHT, IsGrad::YES); + add_slot(sig, WEIGHT, IsGrad::NO); + add_slot(sig, ADAM_V, IsGrad::YES); + add_slot(sig, ADAM_M, IsGrad::YES); + add_arg_slot(sig, ATTRS); + add_arg_slot(sig, PROFILING); + if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + add_unchecked_arg_slot(sig, HANDLE); + } + return sig; +} + +TaskInvocation adam_update(AdamOptimizerAttrs const & attrs, + tensor_guid_t const & weight, + tensor_guid_t const & adam_v, + tensor_guid_t const & adam_m) { + TaskBinding b; + b.bind(WEIGHT, TensorGuidSpec{weight, 
IsGrad::YES});
+  b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
+  b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES});
+  b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES});
+  b.bind_arg(ATTRS, attrs);
+  b.bind_arg(PROFILING, profiling_settings());
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    b.bind_arg(HANDLE, ff_handle());
+    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  }
+  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
+}
+
+static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+  auto attrs = acc.get_argument(ATTRS);
+  auto weight_grad = acc.get_tensor_grad(WEIGHT);
+  auto weight = acc.get_tensor(WEIGHT);
+  auto v_tensor = acc.get_tensor(ADAM_V);
+  auto m_tensor = acc.get_tensor(ADAM_M);
+
+  auto profiling = acc.get_argument(PROFILING);
+
+  assert (weight.shape == weight_grad.shape);
+  size_t size = weight_grad.shape.get_volume();
+
+  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+
+  if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
+    auto handle = acc.get_argument(HANDLE);
+    profile(adam_nccl_update_task_gpu,
+            profiling,
+            "[Adam NCCL] update_time = %.2lfms\n",
+            attrs.alpha_t,
+            attrs.beta1,
+            attrs.beta2,
+            attrs.weight_decay,
+            attrs.epsilon,
+            size,
+            handle,
+            weight_grad.get_float_ptr(),
+            m_tensor.get_float_ptr(),
+            v_tensor.get_float_ptr(),
+            weight.get_float_ptr());
+  } else {
+    profile(adam_ps_update_task_gpu,
+            profiling,
+            "[Adam PS] update_time = %.2lfms\n",
+            attrs.alpha_t,
+            attrs.beta1,
+            attrs.beta2,
+            attrs.weight_decay,
+            attrs.epsilon,
+            size,
+            num_replicas,
+            weight_grad.get_float_ptr(),
+            m_tensor.get_float_ptr(),
+            v_tensor.get_float_ptr(),
+            weight.get_float_ptr());
+  }
+}
+
+AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
+  double new_beta1_t = old.beta_t * old.beta1;
+  double new_beta2_t = old.beta2_t * old.beta2;
+  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+  return AdamOptimizerAttrs{
+      old.alpha,
+      old.beta1,
+      old.beta2,
+      old.weight_decay,
+      new_alpha_t,
+      new_beta1_t,
+      new_beta2_t,
+      old.epsilon
+  };
+}
+
+TaskImplFunction get_adam_update_task_impl() {
+  return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}};
+}
+
+}
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
deleted file mode 100644
index 4bac74b999..0000000000
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-#define _FLEXFLOW_PCG_INCLUDE_PCG_OPTIMIZER_H
-
-#include "pcg/optimizers/adam_optimizer_attrs.h"
-#include "pcg/optimizers/sgd_optimizer_attrs.h"
-#include "utils/variant.h"
-
-namespace FlexFlow {
-
-using OptimizerAttrs = std::variant;
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/include/pcg/optimizer_attrs.variant.toml b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
new file mode 100644
index 0000000000..585c150700
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.variant.toml
@@ -0,0 +1,23 @@
+namespace = "FlexFlow"
+name = "OptimizerAttrs"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "json",
+  "fmt",
+  "rapidcheck",
+]
+
+includes = [
+  "pcg/optimizers/sgd_optimizer_attrs.dtg.h",
+  "pcg/optimizers/adam_optimizer_attrs.dtg.h",
+]
+
+[[values]]
+type = "::FlexFlow::SGDOptimizerAttrs"
+key = "sgd_optimizer"
+
+[[values]]
+type = "::FlexFlow::AdamOptimizerAttrs"
+key = "adam_optimizer"
diff --git a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml 
b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml index fd3e83cc4a..c25baa6c89 100644 --- a/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml +++ b/lib/pcg/include/pcg/optimizers/adam_optimizer_attrs.struct.toml @@ -36,3 +36,7 @@ type = "double" [[fields]] name = "beta2_t" type = "double" + +[[fields]] +name = "epsilon" +type = "double" From 40c62526336ffbbee069988126047dcdad64a1ce Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 13:40:10 -0700 Subject: [PATCH 06/91] Finish optimizer local backing --- lib/kernels/src/array_shape.cc | 4 +- lib/kernels/src/cuda/optimizer_kernels.cu | 40 +++-- .../local-execution/local_slots_backing.h | 11 +- .../local-execution/local_training_backing.h | 3 +- .../local-execution/model_training_instance.h | 13 ++ .../model_training_instance.struct.toml | 5 + .../include/local-execution/optimizer.h | 18 ++- .../src/local_cost_estimator.cc | 3 +- .../src/local_slots_backing.cc | 21 +++ .../src/local_training_backing.cc | 49 ++++++- .../src/model_training_instance.cc | 26 ++++ lib/local-execution/src/optimizer.cc | 137 ++++++++++-------- .../test/src/test_loss_function.cc | 47 +++--- lib/pcg/include/pcg/computation_graph.h | 4 + lib/pcg/include/pcg/optimizer_attrs.h | 13 ++ lib/pcg/src/pcg/computation_graph.cc | 13 ++ lib/pcg/src/pcg/optimizer_attrs.cc | 14 ++ 17 files changed, 300 insertions(+), 121 deletions(-) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h create mode 100644 lib/local-execution/src/model_training_instance.cc create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index ddfa3964e3..054e16e90a 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -101,11 +101,11 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } -bool ArrayShape::operator==(ArrayShape const & other) const { +bool ArrayShape::operator==(ArrayShape const &other) const { return this->dims == other.dims; } -bool ArrayShape::operator!=(ArrayShape const & other) const { +bool ArrayShape::operator!=(ArrayShape const &other) const { return this->dims != other.dims; } diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 7d1d720ba0..2eaf30b21f 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -14,8 +14,8 @@ */ #include "device.h" -#include "kernels/optimizer_kernels.h" #include "kernels/nccl.h" +#include "kernels/optimizer_kernels.h" namespace FlexFlow { @@ -62,15 +62,14 @@ void sgd_ps_update_task_gpu(cudaStream_t stream, } // checkCUDA(cudaDeviceSynchronize()); // Step 2: SGD update - sgd_update<<>>( - size, - lr, - weight_decay, - momentum, - nesterov, - weight_grad_ptr, - sgd_v_ptr, - weight_ptr); + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } @@ -80,7 +79,7 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream, float momentum, bool nesterov, float weight_decay, - PerDeviceFFHandle const & handle, + PerDeviceFFHandle const &handle, float const *weight_grad_ptr, size_t size, float *weight_ptr, @@ -99,15 +98,14 @@ void sgd_nccl_update_task_gpu(cudaStream_t stream, // print_tensor((float*)w_grad_ptr, 16, "[After ncclAllReduce]"); // Step 2: SGD update - sgd_update<<>>( - size, - lr, - 
weight_decay, - momentum, - nesterov, - weight_grad_ptr, - sgd_v_ptr, - weight_ptr); + sgd_update<<>>(size, + lr, + weight_decay, + momentum, + nesterov, + weight_grad_ptr, + sgd_v_ptr, + weight_ptr); // checkCUDA(cudaDeviceSynchronize()); } #endif @@ -197,7 +195,7 @@ void adam_nccl_update_task_gpu(cudaStream_t stream, float weight_decay, float epsilon, size_t size, - PerDeviceFFHandle const & handle, + PerDeviceFFHandle const &handle, float const *weight_grad_ptr, float *adam_m_ptr, float *adam_v_ptr, diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 1f35bdd304..439113c873 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -1,6 +1,6 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H -#define _FLEXFLOW_LOCAL_EXECUTION_SLOT_REGISTRY_H +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H #include "kernels/accessor.h" #include "local-execution/local_task_argument_accessor.h" @@ -23,6 +23,11 @@ struct LocalSlotsBacking { void allocate_outgoing_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); + void allocate_optimizer_tensors(layer_guid_t const &weight_layer, + tensor_guid_t const &, + ComputationGraph const &, + Allocator &, + TaskSignature const &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; @@ -48,6 +53,8 @@ struct LocalSlotsBacking { input_tensor_slots; std::unordered_map> output_tensor_slots; + std::unordered_map> + weight_optimizer_tensor_guids; // arguments std::unordered_map diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 55983086c2..d2586038f0 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -4,7 +4,6 @@ #include "local-execution/local_slots_backing.h" #include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" -#include "op-attrs/ops/loss_functions.h" namespace FlexFlow { @@ -16,7 +15,7 @@ struct LocalTrainingBacking { ComputationGraph const &, TensorBackingMap const &, RuntimeArgConfig const &, - std::optional const &); + std::optional &); void execute_init(); PerLayerElapsedTime execute_forward(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h new file mode 100644 index 0000000000..7ea027a636 --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -0,0 +1,13 @@ + +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "local-execution/model_training_instance.dtg.h" + +namespace FlexFlow { + +ModelTrainingInstance next(ModelTrainingInstance const & old); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index ea7e8d24ab..e3ff397e39 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ 
b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -11,6 +11,7 @@ includes = [ "utils/optional.h", "op-attrs/ops/loss_attrs.dtg.h", "pcg/tensor_guid_t.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] [[fields]] @@ -24,3 +25,7 @@ type = "::FlexFlow::tensor_guid_t" [[fields]] name = "logit_tensor" type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 4702352568..53dcad63de 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -4,17 +4,29 @@ #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature.h" -#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" +#include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" namespace FlexFlow { +TaskSignature get_update_signature(OptimizerAttrs const &); +TaskInvocation get_update_invocation(OptimizerAttrs const &, + tensor_guid_t const &weight, + std::vector const &); +TaskImplFunction get_update_task_impl(OptimizerAttrs const &); + TaskSignature get_sgd_update_signature(); -TaskInvocation sgd_update(SGDOptimizerAttrs const &); +TaskInvocation sgd_update(SGDOptimizerAttrs const &, + tensor_guid_t const &weight, + tensor_guid_t const &); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); -TaskInvocation adam_update(SGDOptimizerAttrs const &); +TaskInvocation adam_update(AdamOptimizerAttrs const &, + tensor_guid_t const &weight, + tensor_guid_t const &, + tensor_guid_t const &); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 1ca422d8e1..a39d55adff 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -72,11 +72,12 @@ CostDetails LocalCostEstimator::estimate_cost( get_vector_piece_attrs(weights), get_vector_piece_attrs(outputs)); + std::optional model_training_instance = std::nullopt; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, this->runtime_arg_config, - std::nullopt); + model_training_instance); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 787c7dda86..5059f29abd 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -47,6 +47,27 @@ void LocalSlotsBacking::allocate_outgoing_tensors( this->output_tensor_slots.insert({layer_guid, outgoing_tensors}); } +void LocalSlotsBacking::allocate_optimizer_tensors( + layer_guid_t const &weight_layer, + tensor_guid_t const &weight, + ComputationGraph const &cg, + Allocator &allocator, + TaskSignature const &sig) { + GenericTensorAccessorW weight_backing = + get_tensor_backing(weight, IsGrad::NO); + int num_buffer_tensors = + sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) + std::vector buffer_tensors = + get_new_tensor_guids_for_layer_without_graph_insertion( + cg, weight_layer, num_buffer_tensors); + for (auto const &tensor_guid : 
buffer_tensors) { + GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( + get_tensor_shape(weight_backing.shape, weight_backing.data_type)); + this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing}); + } + this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors}); +} + bool LocalSlotsBacking::is_tensor_allocated( tensor_guid_t const &tensor_id) const { return contains_key(this->tensor_mapping, tensor_id); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index c8f5f279d2..eb49f16df1 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,12 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" +#include "local-execution/model_training_instance.h" +#include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" -#include "utils/containers/reversed.h" #include "utils/containers/get_only.h" +#include "utils/containers/reversed.h" #include "utils/exception.h" namespace FlexFlow { @@ -14,22 +16,33 @@ LocalTrainingBacking::LocalTrainingBacking( ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, RuntimeArgConfig const &runtime_arg_config, - std::optional const &training_instance) + std::optional &training_instance) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), task_registry(empty_task_registry()), training_instance(training_instance) { - for (layer_guid_t const &node : topological_ordering(computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(computation_graph, node).attrs; + get_layer_attrs(this->computation_graph, node).attrs; // allocate outgoing tensors this->local_slots_backing.allocate_outgoing_tensors( - node, computation_graph, this->allocator); + node, this->computation_graph, this->allocator); // register tasks register_tasks_for_layer(this->task_registry, node, attrs); + + // allocate optimizer buffers + if (attrs.has() && this->training_instance.has_value()) { + OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + TaskSignature sig = get_update_signature(attrs); + tensor_guid_t weight_tensor = + get_only(get_outgoing_tensors(this->computation_graph, node)); + this->local_slots_backing.allocate_optimizer_tensors( + node, weight_tensor, this->computation_graph, this->allocator, sig); + } } if (this->training_instance.has_value()) { @@ -137,13 +150,33 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { } void LocalTrainingBacking::execute_update() { - for (layer_guid_t const &node: topological_ordering(this->computation_graph)) { + assert(this->training_instance.has_value()); + OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has()) { - tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - // TODO: handle momentum vectors separately? handle different updates? 
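+      // update step for each weight layer: look up the weight tensor and its
+      // optimizer buffers (an SGD momentum tensor, or Adam m and v tensors),
+      // build the update TaskInvocation, then run it through the generic
+      // task impl function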
+ // get tensors + tensor_guid_t weight_tensor = + get_only(get_outgoing_tensors(this->computation_graph, node)); + std::vector buffer_tensors = + this->local_slots_backing.weight_optimizer_tensor_guids.at( + weight_tensor); + + // get invocation + TaskInvocation invocation = + get_update_invocation(attrs, weight_tensor, buffer_tensors); + assert(is_invocation_valid(get_update_signature(attrs), invocation)); + + // execute update + TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + TaskImplFunction update_impl_fn = get_update_task_impl(attrs); + update_impl_fn.get().function_ptr(accessor); } } + + this->training_instance = next(this->training_instance.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc new file mode 100644 index 0000000000..646e3ac588 --- /dev/null +++ b/lib/local-execution/src/model_training_instance.cc @@ -0,0 +1,26 @@ +#include "local-execution/model_training_instance.h" + +namespace FlexFlow { + +ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) { + if (old_training_instance.optimizer_attrs.has()) { + AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get(); + double new_beta1_t = old.beta_t * old.beta1; + double new_beta2_t = old.beta2_t * old.beta2; + double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); + OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{ + old.alpha, + old.beta1, + old.beta2, + old.weight_decay, + new_alpha_t, + new_beta1_t, + new_beta2_t, + old.epsilon + }}; + return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + } + return old_training_instance; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 2f45802978..1b1173c70e 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,18 +1,11 @@ -#include "kernels/optimizer_kernels.h" #include "local-execution/optimizer.h" +#include "kernels/optimizer_kernels.h" #include "local-execution/profiling.h" +#include "utils/overload.h" namespace FlexFlow { -enum Slots { - ATTRS, - WEIGHT, - SGD_V, - PROFILING, - ADAM_M, - ADAM_V, - HANDLE -}; +enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); @@ -27,9 +20,9 @@ TaskSignature get_sgd_update_signature() { return sig; } -TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, - tensor_guid_t const & weight, - tensor_guid_t const & sgd_v) { +TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, + tensor_guid_t const &weight, + tensor_guid_t const &sgd_v) { TaskBinding b; b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); @@ -46,53 +39,54 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const & attrs, return {task_id_t::SGD_UPD_PS_TASK_ID, b}; } -static void sgd_update_task_impl(TaskArgumentAccessor const & acc) { +static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto weight_grad = acc.get_tensor_grad(WEIGHT); auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); - assert (weight.shape == weight_grad.shape); + assert(weight.shape == weight_grad.shape); size_t size = 
weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() & weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   float *sgd_v_ptr;
   if (attrs.momentum > 0.0f) {
     auto sgd_v = acc.get_tensor(SGD_V);
-    assert (sgd_v.shape == weight.shape);
+    assert(sgd_v.shape == weight.shape);
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument(HANDLE);
     profile(sgd_nccl_update_task_gpu,
-        profiling,
-        "[SGD NCCL] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        handle,
-        weight_grad.get_float_ptr(),
-        size,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD NCCL] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            handle,
+            weight_grad.get_float_ptr(),
+            size,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
 
   } else {
     profile(sgd_ps_update_task_gpu,
-        profiling,
-        "[SGD PS] update_time = %.2lfms\n",
-        attrs.lr,
-        attrs.momentum,
-        attrs.nesterov,
-        attrs.weight_decay,
-        weight_grad.get_float_ptr(),
-        size,
-        num_replicas,
-        weight.get_float_ptr(),
-        sgd_v_ptr);
+            profiling,
+            "[SGD PS] update_time = %.2lfms\n",
+            attrs.lr,
+            attrs.momentum,
+            attrs.nesterov,
+            attrs.weight_decay,
+            weight_grad.get_float_ptr(),
+            size,
+            num_replicas,
+            weight.get_float_ptr(),
+            sgd_v_ptr);
   }
 }
 
@@ -114,10 +108,10 @@ TaskSignature get_adam_update_signature() {
   return sig;
 }
 
-TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
-                          tensor_guid_t const & weight,
-                          tensor_guid_t const & adam_v,
-                          tensor_guid_t const & adam_m) {
+TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
+                           tensor_guid_t const &weight,
+                           tensor_guid_t const &adam_v,
+                           tensor_guid_t const &adam_m) {
   TaskBinding b;
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES});
   b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO});
@@ -133,7 +127,7 @@ TaskInvocation adam_update(AdamOptimizerAttrs const & attrs,
   return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
-static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
+static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument(ATTRS);
   auto weight_grad = acc.get_tensor_grad(WEIGHT);
   auto weight = acc.get_tensor(WEIGHT);
@@ -142,11 +136,12 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
 
   auto profiling = acc.get_argument(PROFILING);
 
-  assert (weight.shape == weight_grad.shape);
+  assert(weight.shape == weight_grad.shape);
   size_t size = weight_grad.shape.get_volume();
 
-  assert (weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
-  size_t num_replicas = weight_grad.shape.get_volume() / weight.shape.get_volume();
+  assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0);
+  size_t num_replicas =
+      weight_grad.shape.get_volume() / weight.shape.get_volume();
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     auto handle = acc.get_argument(HANDLE);
@@ -182,24 +177,38 @@ static void adam_update_task_impl(TaskArgumentAccessor const & acc) {
   }
 }
 
-AdamOptimizerAttrs next(AdamOptimizerAttrs const & old) {
-  double new_beta1_t = old.beta_t * old.beta1;
-  double new_beta2_t = old.beta2_t * old.beta2;
-  double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
-  return AdamOptimizerAttrs{
-      old.alpha,
-      old.beta1,
-      old.beta2,
-      
old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon - }; -} - TaskImplFunction get_adam_update_task_impl() { return TaskImplFunction{GenericTaskImplFunction{adam_update_task_impl}}; } +TaskSignature get_update_signature(OptimizerAttrs const &attrs) { + return attrs.visit(overload{ + [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); }, + [&](AdamOptimizerAttrs const &s) { + return get_adam_update_signature(); + }}); } + +TaskInvocation + get_update_invocation(OptimizerAttrs const &attrs, + tensor_guid_t const &weight, + std::vector const &buffer_tensors) { + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &s) { + return sgd_update(s, weight, buffer_tensors.at(0)); + }, + [&](AdamOptimizerAttrs const &s) { + return adam_update( + s, weight, buffer_tensors.at(0), buffer_tensors.at(1)); + }}); +} + +TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) { + return attrs.visit(overload{ + [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); }, + [&](AdamOptimizerAttrs const &s) { + return get_adam_update_task_impl(); + }}); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_function.cc index 9e60c1b979..3d9946c89c 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_function.cc @@ -4,6 +4,7 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" #include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.h" #include "test_utils.h" namespace FlexFlow { @@ -18,6 +19,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { EnableProfiling::NO, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; + OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs(); + // construct graph ComputationGraphBuilder cg_builder; @@ -47,11 +50,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); tensor_backing_map.insert({label_tensor, label_backing}); - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{ - SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{SparseCategoricalCrossEntropyLossAttrs{ + /*replace_labels=*/false}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -70,10 +75,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { tensor_backing_map.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{LossAttrs{OtherLossAttrs{ + LossFunction::CATEGORICAL_CROSSENTROPY}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -85,11 +92,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{ - OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + 
ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, @@ -101,10 +110,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("LossFunction::IDENTITY") { - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, - label_tensor, - logit_tensor}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}}, + label_tensor, + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 46d5b22afb..6fbac987ec 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -32,6 +32,10 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n); layer_guid_t get_layer_by_name(ComputationGraph const &cg, std::string const &name); +std::vector + get_new_tensor_guids_for_layer_without_graph_insertion( + ComputationGraph const &, layer_guid_t const &n, int num_tensors); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h new file mode 100644 index 0000000000..b154116a4d --- /dev/null +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H +#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H + +#include "pcg/optimizer_attrs.dtg.h" + +namespace FlexFlow { + +OptimizerAttrs make_empty_sgd_attrs(); +OptimizerAttrs make_empty_adam_attrs(); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index afa1774858..23ddd98f3c 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -64,4 +64,17 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg, return get_only(found); } +std::vector + get_new_tensor_guids_for_layer_without_graph_insertion( + ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) { + std::vector new_tensor_guids; + int num_outgoing_tensors = get_outgoing_tensors(cg, n).size(); + + for (int i = 0; i < num_tensors; ++i) { + new_tensor_guids.push_back( + tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}}); + } + return new_tensor_guids; +} + } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc new file mode 100644 index 0000000000..a1c2a2e6d4 --- /dev/null +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -0,0 +1,14 @@ +#include "pcg/optimizer_attrs.h" + +namespace FlexFlow { + +OptimizerAttrs make_empty_sgd_attrs() { + return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}}; +} + +OptimizerAttrs make_empty_adam_attrs() { + return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0}}; +} + +} // namespace FlexFlow From ad9b9eac557d1d84f3226019a62fddbe3b163cef Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 14:04:56 -0700 Subject: [PATCH 07/91] Format --- .../local-execution/model_training_instance.h | 4 +-- .../src/model_training_instance.cc | 31 ++++++++++--------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- lib/pcg/src/pcg/optimizer_attrs.cc | 6 ++-- 4 files changed, 23 
insertions(+), 20 deletions(-) diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 7ea027a636..afc8fa7472 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -5,8 +5,8 @@ #include "local-execution/model_training_instance.dtg.h" namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const & old); + +ModelTrainingInstance next(ModelTrainingInstance const &old); } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 646e3ac588..d34cc5d49a 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,24 +1,27 @@ #include "local-execution/model_training_instance.h" namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const & old_training_instance) { + +ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { if (old_training_instance.optimizer_attrs.has<AdamOptimizerAttrs>()) { - AdamOptimizerAttrs old = old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>(); + AdamOptimizerAttrs old = + old_training_instance.optimizer_attrs.get<AdamOptimizerAttrs>(); double new_beta1_t = old.beta_t * old.beta1; double new_beta2_t = old.beta2_t * old.beta2; double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); - OptimizerAttrs new_attrs = OptimizerAttrs{AdamOptimizerAttrs{ - old.alpha, - old.beta1, - old.beta2, - old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon - }}; + OptimizerAttrs new_attrs = + OptimizerAttrs{AdamOptimizerAttrs{old.alpha, + old.beta1, + old.beta2, + old.weight_decay, + new_alpha_t, + new_beta1_t, + new_beta2_t, + old.epsilon}}; - return ModelTrainingInstance{old_training_instance.loss_attrs, old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + return ModelTrainingInstance{old_training_instance.loss_attrs, + old_training_instance.label_tensor, + old_training_instance.logit_tensor, + new_attrs}; } return old_training_instance; } diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index b154116a4d..550bf12cc8 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -4,7 +4,7 @@ #include "pcg/optimizer_attrs.dtg.h" namespace FlexFlow { - + OptimizerAttrs make_empty_sgd_attrs(); OptimizerAttrs make_empty_adam_attrs(); diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index a1c2a2e6d4..d51070b10d 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -1,14 +1,14 @@ #include "pcg/optimizer_attrs.h" namespace FlexFlow { - + OptimizerAttrs make_empty_sgd_attrs() { return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}}; } OptimizerAttrs make_empty_adam_attrs() { - return OptimizerAttrs{AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0}}; + return OptimizerAttrs{ + AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; } } // namespace FlexFlow From 1ddfadeebdcdcdabe8a84a03ec51fb5bcb02bfd4 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 14:17:21 -0700 Subject: [PATCH 08/91] E2E update test --- ...test_loss_function.cc => test_loss_e2e.cc} | 2 +- .../test/src/test_update_e2e.cc | 128 ++++++++++++++++++ 2 files changed, 129 insertions(+), 1 
deletion(-) rename lib/local-execution/test/src/{test_loss_function.cc => test_loss_e2e.cc} (99%) create mode 100644 lib/local-execution/test/src/test_update_e2e.cc diff --git a/lib/local-execution/test/src/test_loss_function.cc b/lib/local-execution/test/src/test_loss_e2e.cc similarity index 99% rename from lib/local-execution/test/src/test_loss_function.cc rename to lib/local-execution/test/src/test_loss_e2e.cc index 3d9946c89c..15bf089b6b 100644 --- a/lib/local-execution/test/src/test_loss_function.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -10,7 +10,7 @@ namespace FlexFlow { TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Loss Function Local Execution") { + TEST_CASE("Local Execution E2E") { // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc new file mode 100644 index 0000000000..7f7a90d9a3 --- /dev/null +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -0,0 +1,128 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_training_backing.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Local Execution Update E2E") { + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::NO, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}}; + + // construct graph + ComputationGraphBuilder cg_builder; + + size_t batch_size = 10; + size_t data_dim = 100; + TensorShape input_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + tensor_guid_t input_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::YES); + + float scalar = 4.0; + tensor_guid_t logit_tensor = + cg_builder.scalar_multiply(input_tensor, scalar); + + // allocate memory + Allocator allocator = create_local_cuda_memory_allocator(); + TensorBackingMap tensor_backing_map; + GenericTensorAccessorW input_backing = + allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({input_tensor, input_backing}); + + tensor_guid_t label_tensor = + cg_builder.create_tensor(input_shape, CreateGrad::NO); + GenericTensorAccessorW label_backing = + allocator.allocate_tensor(input_shape); + tensor_backing_map.insert({label_tensor, label_backing}); + + SUBCASE("SGDOptimizerAttrs") { + SUBCASE("momentum=0") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001 + }}; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + SUBCASE("momentum=0.9") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001 + }}; + 
std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + } + SUBCASE("AdamOptimizerAttrs") { + OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{ + /*alpha=*/ 0.001, + /*beta1=*/ 0.9, + /*beta2=*/ 0.999, + /*weight_decay=*/ 0.001, + /*alpha_t=*/ 0.001, + /*beta_t=*/ 0.9, + /*beta2_t=*/ 0.999, + /*epsilon=*/ 1e-8 + } + }; + std::optional model_training_instance = + ModelTrainingInstance{ + LossAttrs{OtherLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + label_tensor, + logit_tensor, optimizer_attrs}; + LocalTrainingBacking local_backing(allocator, + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); + local_backing.execute_init(); + local_backing.execute_forward(); + local_backing.execute_backward(); + local_backing.execute_update(); + } + } +} + +} // namespace FlexFlow From dde9496ada1c18ece558d9ac1b9bb38fbc147417 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 27 Aug 2024 15:23:00 -0700 Subject: [PATCH 09/91] Format --- .../test/src/test_update_e2e.cc | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 7f7a90d9a3..3899f60b83 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -48,46 +48,46 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ - /*lr=*/0.001, - /*momentum=*/0.0f, - /*nesterov=*/false, - /*weight_decay=*/0.001 - }}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ LossAttrs{OtherLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); local_backing.execute_update(); } SUBCASE("momentum=0.9") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{ - /*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001 - }}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; std::optional model_training_instance = ModelTrainingInstance{ LossAttrs{OtherLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + 
cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -95,28 +95,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{ - /*alpha=*/ 0.001, - /*beta1=*/ 0.9, - /*beta2=*/ 0.999, - /*weight_decay=*/ 0.001, - /*alpha_t=*/ 0.001, - /*beta_t=*/ 0.9, - /*beta2_t=*/ 0.999, - /*epsilon=*/ 1e-8 - } - }; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; std::optional model_training_instance = ModelTrainingInstance{ - LossAttrs{OtherLossAttrs{ - LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, + LossAttrs{ + OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, optimizer_attrs}; + logit_tensor, + optimizer_attrs}; LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - tensor_backing_map, - runtime_arg_config, - model_training_instance); + cg_builder.computation_graph, + tensor_backing_map, + runtime_arg_config, + model_training_instance); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); From 59635d827e02dfcc26274784c9d7315985bf86cb Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Sep 2024 12:59:05 -0700 Subject: [PATCH 10/91] Small fixes --- lib/kernels/src/cuda/cuda_helper.cu | 10 +-- lib/kernels/src/device.h | 1 - .../fwd_bwd_op_task_impl_function.h | 32 ++++++++++ .../fwd_bwd_task_impl_function.h | 32 ---------- .../init_op_task_impl_function.h | 33 ++++++++++ .../local-execution/init_task_impl_function.h | 33 ---------- .../model_training_instance.struct.toml | 1 - .../include/local-execution/optimizer.h | 8 +-- .../task_impl_function.variant.toml | 12 ++-- .../include/local-execution/task_signature.h | 23 ++++--- .../task_signature.struct.toml | 9 ++- .../src/fwd_bwd_op_task_impl_function.cc | 54 ++++++++++++++++ .../src/fwd_bwd_task_impl_function.cc | 54 ---------------- .../src/init_op_task_impl_function.cc | 47 ++++++++++++++ .../src/init_task_impl_function.cc | 47 -------------- .../src/local_slots_backing.cc | 64 ++++++++----------- .../src/local_training_backing.cc | 8 +-- .../src/model_training_instance.cc | 3 +- lib/local-execution/src/ops/attention.cc | 6 +- lib/local-execution/src/ops/batch_matmul.cc | 4 +- lib/local-execution/src/ops/batch_norm.cc | 6 +- lib/local-execution/src/ops/cast.cc | 4 +- lib/local-execution/src/ops/combine.cc | 4 +- lib/local-execution/src/ops/concat.cc | 4 +- lib/local-execution/src/ops/conv_2d.cc | 6 +- lib/local-execution/src/ops/dropout.cc | 6 +- lib/local-execution/src/ops/element_binary.cc | 6 +- lib/local-execution/src/ops/element_unary.cc | 6 +- lib/local-execution/src/ops/flat.cc | 4 +- lib/local-execution/src/ops/gather.cc | 6 +- lib/local-execution/src/ops/layer_norm.cc | 6 +- lib/local-execution/src/ops/linear.cc | 6 +- lib/local-execution/src/ops/pool_2d.cc | 6 +- lib/local-execution/src/ops/reduce.cc | 6 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/repartition.cc | 6 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reshape.cc | 6 +- lib/local-execution/src/ops/reverse.cc | 4 +- lib/local-execution/src/ops/softmax.cc | 6 +- 
lib/local-execution/src/ops/split.cc | 4 +- lib/local-execution/src/ops/topk.cc | 6 +- lib/local-execution/src/ops/transpose.cc | 6 +- lib/local-execution/src/optimizer.cc | 17 ++--- lib/local-execution/src/task_invocation.cc | 3 +- lib/local-execution/test/src/test_loss_e2e.cc | 12 ++-- .../test/src/test_update_e2e.cc | 6 +- .../op-attrs/ops/loss_attrs.variant.toml | 6 +- .../include/op-attrs/ops/loss_functions.h | 10 +-- ...=> nonconfigurable_loss_attrs.struct.toml} | 2 +- lib/op-attrs/src/loss_functions.cc | 2 +- lib/pcg/include/pcg/optimizer_attrs.h | 13 ---- lib/pcg/src/pcg/optimizer_attrs.cc | 14 ---- 53 files changed, 327 insertions(+), 361 deletions(-) create mode 100644 lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h create mode 100644 lib/local-execution/include/local-execution/init_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/init_task_impl_function.h create mode 100644 lib/local-execution/src/fwd_bwd_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/fwd_bwd_task_impl_function.cc create mode 100644 lib/local-execution/src/init_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/init_task_impl_function.cc rename lib/op-attrs/include/op-attrs/ops/{other_loss_attrs.struct.toml => nonconfigurable_loss_attrs.struct.toml} (86%) delete mode 100644 lib/pcg/include/pcg/optimizer_attrs.h delete mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 5a303ca15e..4ad22b3a57 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -29,19 +29,13 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { #error "Unknown device, please make sure if CUDA is enabled" #endif -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +__global__ void scale_kernel(float *ptr, size_t size, float a, float b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } } -__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) { - CUDA_KERNEL_LOOP(i, size) { - ptr[i] = (b - a) * ptr[i] + a; - } -} - -__global__ void ones_kernel(float *ptr, coord_t size) { +__global__ void ones_kernel(float *ptr, size_t size) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = 1.0f; } diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h index e32805fde3..ceff2f92ff 100644 --- a/lib/kernels/src/device.h +++ b/lib/kernels/src/device.h @@ -71,7 +71,6 @@ inline int GET_BLOCKS(int const N) { } __global__ void scale_kernel(float *ptr, size_t size, float a, float b); -__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b); __global__ void ones_kernel(float *ptr, size_t size); diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h new file mode 100644 index 0000000000..cc82291f6a --- /dev/null +++ b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H + +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct FwdBwdOpTaskImplFunction { + + std::optional (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(FwdBwdOpTaskImplFunction const 
&) const; + bool operator!=(FwdBwdOpTaskImplFunction const &) const; + bool operator<(FwdBwdOpTaskImplFunction const &) const; + bool operator>(FwdBwdOpTaskImplFunction const &) const; + bool operator<=(FwdBwdOpTaskImplFunction const &) const; + bool operator>=(FwdBwdOpTaskImplFunction const &) const; +}; + +std::string format_as(FwdBwdOpTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> { + size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h deleted file mode 100644 index 7f80af77f3..0000000000 --- a/lib/local-execution/include/local-execution/fwd_bwd_task_impl_function.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct FwdBwdTaskImplFunction { - - std::optional (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(FwdBwdTaskImplFunction const &) const; - bool operator!=(FwdBwdTaskImplFunction const &) const; - bool operator<(FwdBwdTaskImplFunction const &) const; - bool operator>(FwdBwdTaskImplFunction const &) const; - bool operator<=(FwdBwdTaskImplFunction const &) const; - bool operator>=(FwdBwdTaskImplFunction const &) const; -}; - -std::string format_as(FwdBwdTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::FwdBwdTaskImplFunction> { - size_t operator()(::FlexFlow::FwdBwdTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h new file mode 100644 index 0000000000..7b23a2bc64 --- /dev/null +++ b/lib/local-execution/include/local-execution/init_op_task_impl_function.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/task_argument_accessor.h" + +namespace FlexFlow { + +struct InitOpTaskImplFunction { + + DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); + + bool operator==(InitOpTaskImplFunction const &) const; + bool operator!=(InitOpTaskImplFunction const &) const; + bool operator<(InitOpTaskImplFunction const &) const; + bool operator>(InitOpTaskImplFunction const &) const; + bool operator<=(InitOpTaskImplFunction const &) const; + bool operator>=(InitOpTaskImplFunction const &) const; +}; + +std::string format_as(InitOpTaskImplFunction const &x); +std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash<::FlexFlow::InitOpTaskImplFunction> { + size_t operator()(::FlexFlow::InitOpTaskImplFunction const &) const; +}; +} // namespace std + +#endif diff --git a/lib/local-execution/include/local-execution/init_task_impl_function.h b/lib/local-execution/include/local-execution/init_task_impl_function.h deleted 
file mode 100644 index b85944e13a..0000000000 --- a/lib/local-execution/include/local-execution/init_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H - -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct InitTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(InitTaskImplFunction const &) const; - bool operator!=(InitTaskImplFunction const &) const; - bool operator<(InitTaskImplFunction const &) const; - bool operator>(InitTaskImplFunction const &) const; - bool operator<=(InitTaskImplFunction const &) const; - bool operator>=(InitTaskImplFunction const &) const; -}; - -std::string format_as(InitTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::InitTaskImplFunction> { - size_t operator()(::FlexFlow::InitTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index e3ff397e39..b460d6bd3a 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -8,7 +8,6 @@ features = [ ] includes = [ - "utils/optional.h", "op-attrs/ops/loss_attrs.dtg.h", "pcg/tensor_guid_t.dtg.h", "pcg/optimizer_attrs.dtg.h", diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 53dcad63de..e1f11b8a68 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -13,20 +13,20 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation(OptimizerAttrs const &, tensor_guid_t const &weight, - std::vector const &); + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, tensor_guid_t const &weight, - tensor_guid_t const &); + tensor_guid_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, tensor_guid_t const &weight, - tensor_guid_t const &, - tensor_guid_t const &); + tensor_guid_t const &adam_v, + tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml index 1be18bebfa..48cab9eb01 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/local-execution/include/local-execution/task_impl_function.variant.toml @@ -8,18 +8,18 @@ features = [ ] includes = [ - "local-execution/init_task_impl_function.h", - "local-execution/fwd_bwd_task_impl_function.h", + "local-execution/init_op_task_impl_function.h", + "local-execution/fwd_bwd_op_task_impl_function.h", "local-execution/generic_task_impl_function.h", ] 
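# Editorial note, not part of the original patch: the renamed headers in the
# includes list above map one-to-one onto the [[values]] alternatives below;
# C++ call sites then select an alternative through the generated accessor,
# e.g. impl_function.get<InitOpTaskImplFunction>().function_ptr, as the
# local_training_backing.cc hunks later in this patch do.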
[[values]] -type = "::FlexFlow::InitTaskImplFunction" -key = "init_task_impl_function" +type = "::FlexFlow::InitOpTaskImplFunction" +key = "init_op_task_impl_function" [[values]] -type = "::FlexFlow::FwdBwdTaskImplFunction" -key = "fwd_bwd_task_impl_function" +type = "::FlexFlow::FwdBwdOpTaskImplFunction" +key = "fwd_bwd_op_task_impl_function" [[values]] type = "::FlexFlow::GenericTaskImplFunction" diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h index d31a67e027..ed28f8eaea 100644 --- a/lib/local-execution/include/local-execution/task_signature.h +++ b/lib/local-execution/include/local-execution/task_signature.h @@ -1,13 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H -// #include "local-execution/tensor_guid_slot_spec.dtg.h" -// #include "local-execution/serialization.h" -// #include "utils/hash/unordered_map.h" -// #include "utils/hash/unordered_set.h" -// #include "utils/type_index.h" - #include "local-execution/task_signature.dtg.h" +#include "utils/type_index.h" namespace FlexFlow { @@ -38,15 +33,23 @@ void add_return_value(TaskSignature &task_signature) { task_signature.return_value = get_type_index_for_type(); } -// adds arg_slot without checking is_serializable, used for arguments that are -// deviceSpecific +/** + * @brief Adds an argument slot without checking if it is serializable. + * + * This function is used for arguments that are device-specific. + */ + template void add_unchecked_arg_slot(TaskSignature &task_signature, int name) { add_unchecked_arg_slot(task_signature, slot_id_t{name}); } -// adds arg_slot without checking is_serializable, used for arguments that are -// deviceSpecific +/** + * @brief Adds an argument slot without checking if it is serializable. + * + * This function is used for arguments that are device-specific. 
+ */ + template <typename T> void add_unchecked_arg_slot(TaskSignature &task_signature, slot_id_t name) { task_signature.task_arg_types.insert({name, get_type_index_for_type<T>()}); diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index f86f7b0c57..fd15df91d5 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -3,17 +3,22 @@ name = "TaskSignature" features = [ "eq", "fmt", + "hash" ] includes = [ "local-execution/tensor_guid_slot_spec.dtg.h", - "utils/type_index.h", - "utils/optional.h" + "<typeindex>", + "<optional>" ] src_includes = [ "utils/fmt/unordered_map.h", "utils/fmt/unordered_set.h", + "utils/hash/unordered_map.h", + "utils/hash/unordered_set.h", + "utils/fmt/optional.h", + "utils/type_index.h" ] [[fields]] diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc new file mode 100644 index 0000000000..308dbfd3ae --- /dev/null +++ b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc @@ -0,0 +1,54 @@ +#include "local-execution/fwd_bwd_op_task_impl_function.h" + +namespace FlexFlow { + +bool FwdBwdOpTaskImplFunction::operator==( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr == other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator!=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr != other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator<( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr < other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator>( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr > other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator<=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr <= other.function_ptr; +} + +bool FwdBwdOpTaskImplFunction::operator>=( + FwdBwdOpTaskImplFunction const &other) const { + return this->function_ptr >= other.function_ptr; +} + +std::string format_as(FwdBwdOpTaskImplFunction const &x) { + std::ostringstream oss; + oss << "<FwdBwdOpTaskImplFunction>"; + return oss.str(); +} + +std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) { + return s << fmt::to_string(x); +} + +} // namespace FlexFlow + +namespace std { +size_t hash<::FlexFlow::FwdBwdOpTaskImplFunction>::operator()( + ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const { + return std::hash<decltype(x.function_ptr)>{}(x.function_ptr); +} +} // namespace std diff --git a/lib/local-execution/src/fwd_bwd_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_task_impl_function.cc deleted file mode 100644 index f85d7cec61..0000000000 --- a/lib/local-execution/src/fwd_bwd_task_impl_function.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "local-execution/fwd_bwd_task_impl_function.h" - -namespace FlexFlow { - -bool FwdBwdTaskImplFunction::operator==( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator!=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator<( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator>( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool 
FwdBwdTaskImplFunction::operator<=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool FwdBwdTaskImplFunction::operator>=( - FwdBwdTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(FwdBwdTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, FwdBwdTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::FwdBwdTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc new file mode 100644 index 0000000000..1c946982f5 --- /dev/null +++ b/lib/local-execution/src/init_op_task_impl_function.cc @@ -0,0 +1,47 @@ +#include "local-execution/init_op_task_impl_function.h" + +namespace FlexFlow { + +bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const { + return this->function_ptr == other.function_ptr; +} + +bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const { + return this->function_ptr != other.function_ptr; +} + +bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const { + return this->function_ptr < other.function_ptr; +} + +bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const { + return this->function_ptr > other.function_ptr; +} + +bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const { + return this->function_ptr <= other.function_ptr; +} + +bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const { + return this->function_ptr >= other.function_ptr; +} + +std::string format_as(InitOpTaskImplFunction const &x) { + std::ostringstream oss; + oss << ""; + return oss.str(); +} +std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) { + return s << fmt::to_string(x); +} + +} // namespace FlexFlow + +namespace std { +size_t hash::operator()( + ::FlexFlow::InitOpTaskImplFunction const &x) const { + return std::hash{}(x.function_ptr); +} +} // namespace std diff --git a/lib/local-execution/src/init_task_impl_function.cc b/lib/local-execution/src/init_task_impl_function.cc deleted file mode 100644 index 9501f72dd6..0000000000 --- a/lib/local-execution/src/init_task_impl_function.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include "local-execution/init_task_impl_function.h" - -namespace FlexFlow { - -bool InitTaskImplFunction::operator==(InitTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool InitTaskImplFunction::operator!=(InitTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool InitTaskImplFunction::operator<(InitTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool InitTaskImplFunction::operator>(InitTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool InitTaskImplFunction::operator<=(InitTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool InitTaskImplFunction::operator>=(InitTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(InitTaskImplFunction const &x) { - 
std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, InitTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::InitTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 5059f29abd..0a1497b6c8 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -1,5 +1,6 @@ #include "local-execution/local_slots_backing.h" #include "utils/containers/contains_key.h" +#include "utils/containers/map_values.h" #include "utils/overload.h" namespace FlexFlow { @@ -55,17 +56,17 @@ void LocalSlotsBacking::allocate_optimizer_tensors( TaskSignature const &sig) { GenericTensorAccessorW weight_backing = get_tensor_backing(weight, IsGrad::NO); - int num_buffer_tensors = + int num_grad_buffer_tensors = sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector buffer_tensors = + std::vector grad_buffer_tensors = get_new_tensor_guids_for_layer_without_graph_insertion( - cg, weight_layer, num_buffer_tensors); - for (auto const &tensor_guid : buffer_tensors) { + cg, weight_layer, num_grad_buffer_tensors); + for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) { GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( get_tensor_shape(weight_backing.shape, weight_backing.data_type)); this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing}); } - this->weight_optimizer_tensor_guids.insert({weight, buffer_tensors}); + this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors}); } bool LocalSlotsBacking::is_tensor_allocated( @@ -123,8 +124,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( break; default: throw mk_runtime_error( - fmt::format("Invalid TensorRole")); // inserting role yields - // "type_is_unformattable" error + fmt::format("Invalid TensorRole {}", tensor_spec.role)); } IsGrad is_grad = slot_grad_id.is_grad; @@ -154,41 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - ArgSlotsBacking mapping; - for (auto const &arg_binding : binding.get_arg_bindings()) { - slot_id_t arg_slot = arg_binding.first; - OpArgSpec op_arg_spec = arg_binding.second; - - mapping.insert({arg_slot, - op_arg_spec.visit(overload{ - [&](OpArgRefSpec const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }, - })}); - } - return mapping; + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ + return arg_binding.template visit(overload{ + [&](OpArgRefSpec const &s) { + return this->resolve_op_arg_ref_spec(s, op_guid); + }, + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; } + }); + }); } ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( TaskBinding const &binding) const { - ArgSlotsBacking mapping; - for (auto const &arg_binding : binding.get_arg_bindings()) { - slot_id_t arg_slot = arg_binding.first; - TaskArgSpec task_arg_spec = arg_binding.second; - - 
mapping.insert({arg_slot, - task_arg_spec.visit(overload{ - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }, - })}); - } - return mapping; + return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){ + return arg_binding.template visit(overload{ + [&](RuntimeArgRefSpec const &s) { + return this->resolve_runtime_arg_ref_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; } + }); + });; } ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index eb49f16df1..dff33826b9 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -60,7 +60,7 @@ DeviceSpecificDeviceStates TaskSignatureAndImpl task_sig_impl = this->task_registry.task_mapping.at(task_id); auto fn = - task_sig_impl.impl_function.get().function_ptr; + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -70,7 +70,7 @@ std::optional TaskSignatureAndImpl task_sig_impl = this->task_registry.task_mapping.at(task_id); auto fn = - task_sig_impl.impl_function.get().function_ptr; + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -160,13 +160,13 @@ void LocalTrainingBacking::execute_update() { // get tensors tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector buffer_tensors = + std::vector grad_buffer_tensors = this->local_slots_backing.weight_optimizer_tensor_guids.at( weight_tensor); // get invocation TaskInvocation invocation = - get_update_invocation(attrs, weight_tensor, buffer_tensors); + get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d34cc5d49a..c626bfc0e0 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -22,8 +22,9 @@ ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { old_training_instance.label_tensor, old_training_instance.logit_tensor, new_attrs}; + } else { + return old_training_instance; } - return old_training_instance; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index 8ede2cb38b..5e693d43db 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -202,13 +202,13 @@ static std::optional } TaskImplFunction get_attention_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_attention_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_attention_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_attention_init_signature() { diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 1eae409ae2..d60a003061 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ 
b/lib/local-execution/src/ops/batch_matmul.cc @@ -153,10 +153,10 @@ static std::optional } TaskImplFunction get_batch_matmul_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_matmul_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_matmul_fwd_signature() { diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..254d7ef39e 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -144,13 +144,13 @@ static std::optional } TaskImplFunction get_batch_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_batch_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_batch_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_batch_norm_init_signature() { diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc index 3e7baf49a9..d3e43a46a0 100644 --- a/lib/local-execution/src/ops/cast.cc +++ b/lib/local-execution/src/ops/cast.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_cast_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_cast_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_cast_fwd_signature() { diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc index ccc82cce17..92f2931344 100644 --- a/lib/local-execution/src/ops/combine.cc +++ b/lib/local-execution/src/ops/combine.cc @@ -85,10 +85,10 @@ OpTaskSignature get_combine_bwd_signature() { } TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } }; // namespace FlexFlow diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc index 35f663b1cd..94d8fc6827 100644 --- a/lib/local-execution/src/ops/concat.cc +++ b/lib/local-execution/src/ops/concat.cc @@ -79,10 +79,10 @@ static std::optional } TaskImplFunction get_concat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_concat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_concat_fwd_signature() { diff --git a/lib/local-execution/src/ops/conv_2d.cc 
b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..7694a03947 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -132,13 +132,13 @@ static std::optional } TaskImplFunction get_conv_2d_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_conv_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_conv_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_conv_2d_init_signature() { diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc index cac08866cc..77a2963313 100644 --- a/lib/local-execution/src/ops/dropout.cc +++ b/lib/local-execution/src/ops/dropout.cc @@ -87,13 +87,13 @@ static std::optional } TaskImplFunction get_dropout_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_dropout_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_dropout_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_dropout_init_signature() { diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc index 48c6c699a2..2152b1beea 100644 --- a/lib/local-execution/src/ops/element_binary.cc +++ b/lib/local-execution/src/ops/element_binary.cc @@ -126,15 +126,15 @@ static std::optional } TaskImplFunction get_element_binary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_binary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_binary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_element_binary_init_signature() { diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index 502afb5f9f..64a0c5e94e 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ b/lib/local-execution/src/ops/element_unary.cc @@ -115,13 +115,13 @@ static std::optional } TaskImplFunction get_element_unary_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_element_unary_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_element_unary_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature 
get_element_unary_init_signature() { diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc index 3fe5029fa1..8df5703f60 100644 --- a/lib/local-execution/src/ops/flat.cc +++ b/lib/local-execution/src/ops/flat.cc @@ -53,10 +53,10 @@ static std::optional } TaskImplFunction get_flat_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_flat_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_flat_fwd_signature() { diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..558988f9a4 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -122,13 +122,13 @@ static std::optional } TaskImplFunction get_gather_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_gather_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_gather_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_gather_init_signature() { diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..b1f44d69ae 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -146,13 +146,13 @@ static DeviceSpecificDeviceStates } TaskImplFunction get_layer_norm_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_layer_norm_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_layer_norm_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_layer_norm_fwd_signature() { diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 9934e2a45c..9e29a0cce0 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -161,13 +161,13 @@ static std::optional } TaskImplFunction get_linear_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; + return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; } TaskImplFunction get_linear_fwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } TaskImplFunction get_linear_bwd_task_impl() { - return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } OpTaskSignature get_linear_init_signature() { diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 789ed2cd63..093a3c1374 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ 
b/lib/local-execution/src/ops/pool_2d.cc
@@ -142,13 +142,13 @@ static std::optional
 }
 
 TaskImplFunction get_pool_2d_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_pool_2d_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_pool_2d_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_pool_2d_init_signature() {
diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc
index a043d9f847..01d2f0e86f 100644
--- a/lib/local-execution/src/ops/reduce.cc
+++ b/lib/local-execution/src/ops/reduce.cc
@@ -102,13 +102,13 @@ static std::optional
 }
 
 TaskImplFunction get_reduce_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_reduce_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reduce_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reduce_init_signature() {
diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc
index a58d79a4f8..f946b7d146 100644
--- a/lib/local-execution/src/ops/reduction.cc
+++ b/lib/local-execution/src/ops/reduction.cc
@@ -74,10 +74,10 @@ static std::optional
 }
 
 TaskImplFunction get_reduction_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reduction_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reduction_fwd_signature() {
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index 73692f4a13..e260fd77f5 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -98,13 +98,13 @@ static std::optional
 }
 
 TaskImplFunction get_repartition_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_repartition_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_repartition_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_repartition_init_signature() {
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index 135475a711..10cd80a6d9 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -73,10 +73,10 @@ static std::optional
 }
 
 TaskImplFunction get_replicate_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_replicate_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_replicate_fwd_signature() {
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index 7584d405eb..433e961a8a 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -92,13 +92,13 @@ static std::optional
 }
 
 TaskImplFunction get_reshape_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_reshape_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reshape_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reshape_init_signature() {
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index 366a579bea..b767b61b20 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -103,10 +103,10 @@ static std::optional
 }
 
 TaskImplFunction get_reverse_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_reverse_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_reverse_fwd_signature() {
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 4c7979ae9b..36c4afcaf3 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -108,13 +108,13 @@ static std::optional
 }
 
 TaskImplFunction get_softmax_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_softmax_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_softmax_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_softmax_init_signature() {
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index 9f039d84f8..dc627aae96 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -114,10 +114,10 @@ static std::optional
 }
 
 TaskImplFunction get_split_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_split_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_split_fwd_signature() {
diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index 7f3519529a..ea4fc09e19 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -120,13 +120,13 @@ static std::optional
 }
 
 TaskImplFunction get_topk_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_topk_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_topk_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_topk_init_signature() {
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 5c3c1dd1ca..099206e372 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -100,13 +100,13 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
 }
 
 TaskImplFunction get_transpose_init_task_impl() {
-  return TaskImplFunction{InitTaskImplFunction{init_task_impl}};
+  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
 TaskImplFunction get_transpose_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
 TaskImplFunction get_transpose_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}};
+  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
 
 OpTaskSignature get_transpose_init_signature() {
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1b1173c70e..485955a5dc 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -35,8 +35,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
     return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+  } else {
+    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::SGD_UPD_PS_TASK_ID, b};
 }
 
 static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
@@ -183,8 +184,8 @@ TaskImplFunction get_adam_update_task_impl() {
 
 TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_signature(); },
-      [&](AdamOptimizerAttrs const &s) {
+      [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); },
+      [&](AdamOptimizerAttrs const &) {
         return get_adam_update_signature();
       }});
 }
@@ -192,21 +193,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 
 TaskInvocation
     get_update_invocation(OptimizerAttrs const &attrs,
                           tensor_guid_t const &weight,
-                          std::vector const &buffer_tensors) {
+                          std::vector const &grad_buffer_tensors) {
   return attrs.visit(
       overload{[&](SGDOptimizerAttrs const &s) {
-                 return sgd_update(s, weight, buffer_tensors.at(0));
+                 return sgd_update(s, weight, grad_buffer_tensors.at(0));
               },
              [&](AdamOptimizerAttrs const &s) {
                return adam_update(
-                   s, weight, buffer_tensors.at(0), buffer_tensors.at(1));
+                   s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
              }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
-      [&](SGDOptimizerAttrs const &s) { return get_sgd_update_task_impl(); },
-      [&](AdamOptimizerAttrs const &s) {
+      [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); },
+      [&](AdamOptimizerAttrs const &) {
         return get_adam_update_task_impl();
       }});
 }
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
index c64af5332e..e15b9ae4ef 100644
--- a/lib/local-execution/src/task_invocation.cc
+++ b/lib/local-execution/src/task_invocation.cc
@@ -42,8 +42,7 @@ std::unordered_map const &
 }
 
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
-  // TODO: implement signature checking
-  return true;
+  NOT_IMPLEMENTED();
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 15bf089b6b..740c2a7355 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -19,7 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::NO,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
-    OptimizerAttrs optimizer_attrs = make_empty_sgd_attrs();
+    OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
+        /*lr=*/0.0,
+        /*momentum=*/0.0,
+        /*nesterov=*/false,
+        /*weight_decay=*/0.0}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
@@ -76,7 +80,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
       std::optional<ModelTrainingInstance> model_training_instance =
-          ModelTrainingInstance{LossAttrs{OtherLossAttrs{
+          ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{
                                     LossFunction::CATEGORICAL_CROSSENTROPY}},
                                 label_tensor,
                                 logit_tensor,
@@ -94,7 +98,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                   LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -112,7 +116,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     SUBCASE("LossFunction::IDENTITY") {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{LossFunction::IDENTITY}},
+              LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 3899f60b83..6ad59c8286 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -55,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*weight_decay=*/0.001}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -78,7 +78,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*weight_decay=*/0.001}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{OtherLossAttrs{
+              LossAttrs{NonconfigurableLossAttrs{
                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
@@ -107,7 +107,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
               LossAttrs{
-                  OtherLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+                  NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
index 8a4f38839c..d60c6507cf 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
@@ -10,7 +10,7 @@ features = [
 
 includes = [
   "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h",
-  "op-attrs/ops/other_loss_attrs.dtg.h"
+  "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
 ]
 
 [[values]]
@@ -18,5 +18,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
 key = "sparse_categorical_ce_loss_attrs"
 
 [[values]]
-type = "::FlexFlow::OtherLossAttrs"
-key = "other_loss_attrs"
+type = "::FlexFlow::NonconfigurableLossAttrs"
+key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
index 9fb0597197..74d2d0a479 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
@@ -1,11 +1,11 @@
 #ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 
-#include "core.h"
-#include "loss_attrs.dtg.h"
-#include "loss_function.dtg.h"
-#include "other_loss_attrs.dtg.h"
-#include "sparse_categorical_ce_loss_attrs.dtg.h"
+#include "op-attrs/ops/core.h"
+#include "op-attrs/ops/loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_function.dtg.h"
+#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
+#include "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
similarity index 86%
rename from lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
rename to lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
index 81055f5835..0420e7ef7b 100644
--- a/lib/op-attrs/include/op-attrs/ops/other_loss_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "OtherLossAttrs"
+name = "NonconfigurableLossAttrs"
 features = [
   "eq",
   "ord",
diff --git a/lib/op-attrs/src/loss_functions.cc b/lib/op-attrs/src/loss_functions.cc
index cae88be453..50a26ec792 100644
--- a/lib/op-attrs/src/loss_functions.cc
+++ b/lib/op-attrs/src/loss_functions.cc
@@ -12,7 +12,7 @@ LossFunction get_loss_function(LossAttrs const &attrs) {
       overload{[&](SparseCategoricalCrossEntropyLossAttrs const &s) {
                  return LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY;
                },
-               [&](OtherLossAttrs const &s) { return s.loss_type; }});
+               [&](NonconfigurableLossAttrs const &s) { return s.loss_type; }});
 }
 
 LossFunction parse_loss_name(std::string const &raw_name) {
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
deleted file mode 100644
index 550bf12cc8..0000000000
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
-#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_OPTIMIZER_ATTRS_H
-
-#include "pcg/optimizer_attrs.dtg.h"
-
-namespace FlexFlow {
-
-OptimizerAttrs make_empty_sgd_attrs();
-OptimizerAttrs make_empty_adam_attrs();
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
deleted file mode 100644
index d51070b10d..0000000000
--- a/lib/pcg/src/pcg/optimizer_attrs.cc
+++ /dev/null
@@ -1,14 +0,0 @@
-#include "pcg/optimizer_attrs.h"
-
-namespace FlexFlow {
-
-OptimizerAttrs make_empty_sgd_attrs() {
-  return OptimizerAttrs{SGDOptimizerAttrs{0.0, 0.0, false, 0.0}};
-}
-
-OptimizerAttrs make_empty_adam_attrs() {
-  return OptimizerAttrs{
-      AdamOptimizerAttrs{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}};
-}
-
-} // namespace FlexFlow

From 103ef073a4eedd0108ac6537541d5e4d2f6a03d9 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 11 Sep 2024 12:59:33 -0700
Subject: [PATCH 11/91] Format

---
 .../include/local-execution/optimizer.h       |  7 ++--
 .../include/local-execution/task_signature.h  |  4 +--
 .../src/init_op_task_impl_function.cc         | 18 ++++++----
 .../src/local_slots_backing.cc                | 35 +++++++++----------
 lib/local-execution/src/optimizer.cc          | 34 ++++++++----------
 lib/local-execution/test/src/test_loss_e2e.cc | 10 +++---
 .../test/src/test_update_e2e.cc               |  4 +--
 7 files changed, 57 insertions(+), 55 deletions(-)

diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index e1f11b8a68..a6395a4daa 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -11,9 +11,10 @@ namespace FlexFlow {
 
 TaskSignature get_update_signature(OptimizerAttrs const &);
-TaskInvocation get_update_invocation(OptimizerAttrs const &,
-                                     tensor_guid_t const &weight,
-                                     std::vector const &grad_buffer_tensors);
+TaskInvocation get_update_invocation(
+    OptimizerAttrs const &,
+    tensor_guid_t const &weight,
+    std::vector const &grad_buffer_tensors);
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index ed28f8eaea..6da69f2441 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -35,7 +35,7 @@ void add_return_value(TaskSignature &task_signature) {
 
 /**
  * @brief Adds an argument slot without checking if it is serializable.
- * 
+ *
  * This function is used for arguments that are device-specific.
 */
 
@@ -46,7 +46,7 @@ void add_unchecked_arg_slot(TaskSignature &task_signature, int name) {
 
 /**
 * @brief Adds an argument slot without checking if it is serializable.
- * 
+ *
 * This function is used for arguments that are device-specific.
 */
diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc
index 1c946982f5..abe84b828e 100644
--- a/lib/local-execution/src/init_op_task_impl_function.cc
+++ b/lib/local-execution/src/init_op_task_impl_function.cc
@@ -2,27 +2,33 @@
 
 namespace FlexFlow {
 
-bool InitOpTaskImplFunction::operator==(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator==(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr == other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator!=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator!=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr != other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator<(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator<(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr < other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator>(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator>(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr > other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator<=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator<=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr <= other.function_ptr;
 }
 
-bool InitOpTaskImplFunction::operator>=(InitOpTaskImplFunction const &other) const {
+bool InitOpTaskImplFunction::operator>=(
+    InitOpTaskImplFunction const &other) const {
   return this->function_ptr >= other.function_ptr;
 }
 
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 0a1497b6c8..7050063254 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,29 +154,28 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){
-    return arg_binding.template visit<ConcreteArgSpec>(overload{
-      [&](OpArgRefSpec const &s) {
-        return this->resolve_op_arg_ref_spec(s, op_guid);
-      },
-      [&](RuntimeArgRefSpec const &s) {
-        return this->resolve_runtime_arg_ref_spec(s);
-      },
-      [](ConcreteArgSpec const &s) { return s; }
-    });
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(
+        overload{[&](OpArgRefSpec const &s) {
+                   return this->resolve_op_arg_ref_spec(s, op_guid);
+                 },
+                 [&](RuntimeArgRefSpec const &s) {
+                   return this->resolve_runtime_arg_ref_spec(s);
+                 },
+                 [](ConcreteArgSpec const &s) { return s; }});
   });
 }
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding){
-    return arg_binding.template visit<ConcreteArgSpec>(overload{
-      [&](RuntimeArgRefSpec const &s) {
-        return this->resolve_runtime_arg_ref_spec(s);
-      },
-      [](ConcreteArgSpec const &s) { return s; }
-    });
-  });;
+  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+    return arg_binding.template visit<ConcreteArgSpec>(
+        overload{[&](RuntimeArgRefSpec const &s) {
+                   return this->resolve_runtime_arg_ref_spec(s);
+                 },
+                 [](ConcreteArgSpec const &s) { return s; }});
+  });
+  ;
 }
 
 ConcreteArgSpec
     LocalSlotsBacking::resolve_op_arg_ref_spec(
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 485955a5dc..29beb15edf 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -35,7 +35,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
     return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
-  } else { 
+  } else {
     return {task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
 }
@@ -185,31 +185,27 @@ TaskImplFunction get_adam_update_task_impl() {
 TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &) { return get_sgd_update_signature(); },
-      [&](AdamOptimizerAttrs const &) {
-        return get_adam_update_signature();
-      }});
+      [&](AdamOptimizerAttrs const &) { return get_adam_update_signature(); }});
 }
 
-TaskInvocation
-    get_update_invocation(OptimizerAttrs const &attrs,
-                          tensor_guid_t const &weight,
-                          std::vector const &grad_buffer_tensors) {
-  return attrs.visit(
-      overload{[&](SGDOptimizerAttrs const &s) {
-                 return sgd_update(s, weight, grad_buffer_tensors.at(0));
-               },
-               [&](AdamOptimizerAttrs const &s) {
-                 return adam_update(
-                     s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
-               }});
+TaskInvocation get_update_invocation(
+    OptimizerAttrs const &attrs,
+    tensor_guid_t const &weight,
+    std::vector const &grad_buffer_tensors) {
+  return attrs.visit(overload{
+      [&](SGDOptimizerAttrs const &s) {
+        return sgd_update(s, weight, grad_buffer_tensors.at(0));
+      },
+      [&](AdamOptimizerAttrs const &s) {
+        return adam_update(
+            s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1));
+      }});
 }
 
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &) { return get_sgd_update_task_impl(); },
-      [&](AdamOptimizerAttrs const &) {
-        return get_adam_update_task_impl();
-      }});
+      [&](AdamOptimizerAttrs const &) { return get_adam_update_task_impl(); }});
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 740c2a7355..6cc66032ff 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -19,11 +19,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::NO,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
 
-    OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{
-        /*lr=*/0.0,
-        /*momentum=*/0.0,
-        /*nesterov=*/false,
-        /*weight_decay=*/0.0}};
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
+                                         /*momentum=*/0.0,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.0}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 6ad59c8286..f300fe0720 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -106,8 +106,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           /*epsilon=*/1e-8}};
       std::optional<ModelTrainingInstance> model_training_instance =
           ModelTrainingInstance{
-              LossAttrs{
-                  NonconfigurableLossAttrs{LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
+              LossAttrs{NonconfigurableLossAttrs{
+                  LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
              label_tensor,
              logit_tensor,
              optimizer_attrs};

From f48f9ff97022910e69e0711b3cc0155db23da5bb Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 17 Sep 2024 17:43:22 -0700
Subject: [PATCH 12/91] Fix test and small issues

---
 lib/kernels/include/kernels/array_shape.h     |  1 +
 lib/kernels/include/kernels/profiling.h       |  1 +
 lib/kernels/src/array_shape.cc                | 28 +++-----
 .../local-execution/local_slots_backing.h     |  2 +-
 .../include/local-execution/loss_functions.h  |  2 +-
 .../include/local-execution/optimizer.h       |  2 +-
 .../include/local-execution/task_binding.h    | 58 +++++++++++++++++
 .../include/local-execution/task_invocation.h | 65 +------------------
 .../task_invocation.struct.toml               | 19 ++++++
 .../src/local_slots_backing.cc                |  4 +-
 .../src/local_training_backing.cc             |  5 +-
 lib/local-execution/src/loss_functions.cc     |  6 +-
 lib/local-execution/src/optimizer.cc          |  9 +--
 lib/local-execution/src/task_binding.cc       | 44 +++++++++++++
 lib/local-execution/src/task_invocation.cc    | 39 -----------
 lib/local-execution/test/src/test_loss_e2e.cc |  6 +-
 .../test/src/test_update_e2e.cc               |  6 +-
 17 files changed, 157 insertions(+), 140 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/task_binding.h
 create mode 100644 lib/local-execution/include/local-execution/task_invocation.struct.toml
 create mode 100644 lib/local-execution/src/task_binding.cc

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 6b0b57b57f..015cacc7cb 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -17,6 +17,7 @@ struct ArrayShape {
   ArrayShape(size_t *dims, size_t num_dims);
   ArrayShape(TensorShape const &shape);
   ArrayShape(std::vector<std::size_t> const &);
+  ArrayShape(LegionTensorDims const &);
 
   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 655d540685..31c70010a0 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -40,6 +40,7 @@ std::optional profiling_wrapper(F const &f,
   }
 
   float elapsed = 0;
+  std::cout << "hello";
   checkCUDA(ffEventRecord(t_end, stream));
   checkCUDA(ffEventSynchronize(t_end));
   checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end));
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 054e16e90a..8464212290 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -1,4 +1,5 @@
 #include "kernels/array_shape.h"
+#include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
 
 namespace FlexFlow {
@@ -19,6 +20,9 @@ ArrayShape::ArrayShape(TensorShape const &shape)
 ArrayShape::ArrayShape(std::vector<std::size_t> const &input_dims)
     : dims(input_dims) {}
 
+ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+    : dims(legion_tensor_dims) {}
+
 std::size_t ArrayShape::get_volume() const {
   return this->num_elements();
 }
@@ -51,33 +55,19 @@ std::size_t ArrayShape::at(ff_dim_t idx) const {
 }
 
 ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
-  NOT_IMPLEMENTED();
+  legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims());
+  return this->sub_shape(start, legion_end);
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
-  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = ff_dim_t{start_idx.value + 1};
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{legion_dims_from_ff_dims(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
 }
 
 ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
                                  std::optional<legion_dim_t> end) const {
-  std::vector<std::size_t> new_shape;
-  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
-  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});
-
-  while (start_idx < end_idx) {
-    new_shape.push_back(this->at(start_idx));
-    start_idx = add_to_legion_dim(start_idx, 1);
-  }
-  return ArrayShape{new_shape};
+  return ArrayShape{slice(this->dims, start, end)};
 }
 
 std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 439113c873..678be4c96b 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -7,7 +7,7 @@
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/per_device_op_state.h"
 #include "local-execution/runtime_arg_config.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h
index 58405536d8..2298115d5d 100644
--- a/lib/local-execution/include/local-execution/loss_functions.h
+++ b/lib/local-execution/include/local-execution/loss_functions.h
@@ -17,7 +17,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "op-attrs/ops/loss_functions.h"
 
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index a6395a4daa..1e2cd65362 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -2,7 +2,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 
 #include "local-execution/task_impl_function.dtg.h"
-#include "local-execution/task_invocation.h"
+#include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "pcg/optimizers/adam_optimizer_attrs.dtg.h"
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
new file mode 100644
index 0000000000..cbe210f438
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -0,0 +1,58 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
+
+#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/task_arg_spec.dtg.h"
+#include "local-execution/task_id_t.dtg.h"
+#include "local-execution/task_signature.dtg.h"
+#include "local-execution/tensor_guid_spec.dtg.h"
+
+namespace FlexFlow {
+
+struct TaskBinding {
+  TaskBinding() = default;
+
+  void bind(int, TensorGuidSpec const &);
+  void bind(slot_id_t, TensorGuidSpec const &);
+
+  template <typename T>
+  void bind_arg(int name, T const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, T const &t) {
+    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
+  }
+
+  template <typename T>
+  void bind_arg(int name, RuntimeArgRef<T> const &t) {
+    this->bind_arg(slot_id_t{name}, t);
+  }
+
+  template <typename T>
+  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
+    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
+  }
+
+  bool operator==(TaskBinding const &other) const;
+  bool operator!=(TaskBinding const &other) const;
+
+  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+      get_tensor_bindings() const;
+  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
+
+private:
+  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
+
+private:
+  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
+  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+             std::unordered_map<slot_id_t, TaskArgSpec> const &>
+      tie() const;
+};
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 2317c65c02..93b5743a80 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -1,71 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H
 
-#include "local-execution/slot_grad_id.dtg.h"
-#include "local-execution/slot_id_t.dtg.h"
-#include "local-execution/task_arg_spec.dtg.h"
-#include "local-execution/task_id_t.dtg.h"
-#include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
+#include "local-execution/task_invocation.dtg.h"
 
 namespace FlexFlow {
 
-struct TaskBinding {
-  TaskBinding() = default;
-
-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
-
-  template <typename T>
-  void bind_arg(int name, T const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, T const &t) {
-    this->insert_arg_spec(name, TaskArgSpec{ConcreteArgSpec::create(t)});
-  }
-
-  template <typename T>
-  void bind_arg(int name, RuntimeArgRef<T> const &t) {
-    this->bind_arg(slot_id_t{name}, t);
-  }
-
-  template <typename T>
-  void bind_arg(slot_id_t name, RuntimeArgRef<T> const &ref) {
-    this->insert_arg_spec(name, TaskArgSpec{RuntimeArgRefSpec::create(ref)});
-  }
-
-  bool operator==(TaskBinding const &other) const;
-  bool operator!=(TaskBinding const &other) const;
-
-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
-      get_tensor_bindings() const;
-  std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
-
-private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
-  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
-
-private:
-  void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec);
-  std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-             std::unordered_map<slot_id_t, TaskArgSpec> const &>
-      tie() const;
-};
-
-struct TaskInvocation {
-public:
-  TaskInvocation() = delete;
-  TaskInvocation(task_id_t task_id, TaskBinding const &binding)
-      : task_id(task_id), binding(binding) {}
-
-public:
-  task_id_t task_id;
-  TaskBinding binding;
-};
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv);
-
-} // namespace FlexFlow
+
+}
 
 #endif
diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml
new file mode 100644
index 0000000000..abcaabda93
--- /dev/null
+++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml
@@ -0,0 +1,19 @@
+namespace = "FlexFlow"
+name = "TaskInvocation"
+features = [
+  "eq"
+]
+
+includes = [
+  "local-execution/task_binding.h",
+  "local-execution/task_id_t.dtg.h"
+]
+
+
+[[fields]]
+name = "task_id"
+type = "::FlexFlow::task_id_t"
+
+[[fields]]
+name = "binding"
+type = "::FlexFlow::TaskBinding"
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 7050063254..194d64c34b 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,7 +154,7 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+  return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
     return arg_binding.template visit<ConcreteArgSpec>(
         overload{[&](OpArgRefSpec const &s) {
                    return this->resolve_op_arg_ref_spec(s, op_guid);
@@ -168,7 +168,7 @@ ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](auto const &arg_binding) {
+  return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
     return arg_binding.template visit<ConcreteArgSpec>(
         overload{[&](RuntimeArgRefSpec const &s) {
                    return this->resolve_runtime_arg_ref_spec(s);
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index dff33826b9..7f0b179390 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -3,6 +3,7 @@
 #include "local-execution/model_training_instance.h"
 #include "local-execution/optimizer.h"
 #include "local-execution/task_signature_impl.h"
+#include "local-execution/task_invocation.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
@@ -124,7 +125,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
         backward(unwrapped_training_instance.loss_attrs,
                  unwrapped_training_instance.logit_tensor,
                  unwrapped_training_instance.label_tensor);
-    assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+    // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
     TaskArgumentAccessor loss_accessor =
         this->get_task_arg_accessor(loss_invocation);
     TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
@@ -167,7 +168,7 @@ void LocalTrainingBacking::execute_update() {
       // get invocation
       TaskInvocation invocation =
           get_update_invocation(attrs, weight_tensor, grad_buffer_tensors);
-      assert(is_invocation_valid(get_update_signature(attrs), invocation));
+      // assert(is_invocation_valid(get_update_signature(attrs), invocation));
 
       // execute update
       TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
index 771d175a7d..3a4c616377 100644
--- a/lib/local-execution/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -41,7 +41,7 @@ TaskInvocation
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
 
-  return {task_id_t::LOSS_BWD_TASK_ID, b};
+  return TaskInvocation{task_id_t::LOSS_BWD_TASK_ID, b};
 }
 
 static void backward_task_impl(TaskArgumentAccessor const &acc) {
@@ -51,7 +51,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   auto logit = acc.get_tensor(LOGIT);
   auto label = acc.get_tensor(LABEL);
   int batch_size = logit.shape.at(legion_dim_t{1});
-  // assuming logit shape is [parallel dim(?), batch dim, num classes]
+  // assuming logit shape is [batch dim, num classes]
 
   LossFunction loss_type = get_loss_function(attrs);
   float scale_factor = 1.0f / batch_size;
@@ -61,7 +61,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   }
 
   if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) {
-    // label shape is [parallel dim(?), batch dim, 1]
+    // label shape is [batch dim, 1]
     auto scce_attrs = attrs.get<SparseCategoricalCrossEntropyLossAttrs>();
     size_t ndim = logit.shape.num_dims();
     int num_classes = logit.shape.at(legion_dim_t{0});
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 29beb15edf..30f20bf8ec 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -34,9 +34,9 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::SGD_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b};
   } else {
-    return {task_id_t::SGD_UPD_PS_TASK_ID, b};
+    return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b};
   }
 }
 
@@ -123,9 +123,10 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs,
 
   if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) {
     b.bind_arg(HANDLE, ff_handle());
-    return {task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+    return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b};
+  } else {
+    return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b};
   }
-  return {task_id_t::ADAM_UPD_PS_TASK_ID, b};
 }
 
 static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
new file mode 100644
index 0000000000..a5a3b2dc34
--- /dev/null
+++ b/lib/local-execution/src/task_binding.cc
@@ -0,0 +1,44 @@
+#include "local-execution/task_binding.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
+  this->bind(slot_id_t{name}, tensor_guid_spec);
+}
+
+void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
+  this->tensor_bindings.insert(
+      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
+}
+
+void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
+  assert(!contains_key(this->arg_bindings, name));
+  this->arg_bindings.insert({name, arg_spec});
+}
+
+bool TaskBinding::operator==(TaskBinding const &other) const {
+  return this->tie() == other.tie();
+}
+
+bool TaskBinding::operator!=(TaskBinding const &other) const {
+  return this->tie() != other.tie();
+}
+
+std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
+           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+    TaskBinding::tie() const {
+  return std::tie(this->tensor_bindings, this->arg_bindings);
+}
+
+std::unordered_map<SlotGradId, TensorGuidSpec> const &
+    TaskBinding::get_tensor_bindings() const {
+  return this->tensor_bindings;
+}
+
+std::unordered_map<slot_id_t, TaskArgSpec> const &
+    TaskBinding::get_arg_bindings() const {
+  return this->arg_bindings;
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_invocation.cc b/lib/local-execution/src/task_invocation.cc
index e15b9ae4ef..e08c1036da 100644
--- a/lib/local-execution/src/task_invocation.cc
+++ b/lib/local-execution/src/task_invocation.cc
@@ -1,46 +1,7 @@
 #include "local-execution/task_invocation.h"
-#include "utils/containers/contains_key.h"
 
 namespace FlexFlow {
 
-void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) {
-  this->bind(slot_id_t{name}, tensor_guid_spec);
-}
-
-void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) {
-  this->tensor_bindings.insert(
-      {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec});
-}
-
-void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
-  assert(!contains_key(this->arg_bindings, name));
-  this->arg_bindings.insert({name, arg_spec});
-}
-
-bool TaskBinding::operator==(TaskBinding const &other) const {
-  return this->tie() == other.tie();
-}
-
-bool TaskBinding::operator!=(TaskBinding const &other) const {
-  return this->tie() != other.tie();
-}
-
-std::tuple<std::unordered_map<SlotGradId, TensorGuidSpec> const &,
-           std::unordered_map<slot_id_t, TaskArgSpec> const &>
-    TaskBinding::tie() const {
-  return std::tie(this->tensor_bindings, this->arg_bindings);
-}
-
-std::unordered_map<SlotGradId, TensorGuidSpec> const &
-    TaskBinding::get_tensor_bindings() const {
-  return this->tensor_bindings;
-}
-
-std::unordered_map<slot_id_t, TaskArgSpec> const &
-    TaskBinding::get_arg_bindings() const {
-  return this->arg_bindings;
-}
-
 bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) {
   NOT_IMPLEMENTED();
 }
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 6cc66032ff..3bc85354a0 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
         DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index f300fe0720..b5a503f430 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -4,7 +4,7 @@
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
 #include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.h"
+#include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
 
 namespace FlexFlow {
@@ -16,8 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
     RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
         DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::NO,
-        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/0}};
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
     // construct graph
     ComputationGraphBuilder cg_builder;

From 189c9c8c034143cd4a5fc4bab0db652444601915 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 17 Sep 2024 17:43:37 -0700
Subject: [PATCH 13/91] Format

---
 .../include/local-execution/task_invocation.h |  2 +-
 .../src/local_slots_backing.cc                | 36 ++++++++++---------
 .../src/local_training_backing.cc             |  2 +-
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/local-execution/include/local-execution/task_invocation.h
index 93b5743a80..d03d6ac8e1 100644
--- a/lib/local-execution/include/local-execution/task_invocation.h
+++ b/lib/local-execution/include/local-execution/task_invocation.h
@@ -6,7 +6,7 @@ namespace FlexFlow {
 
 bool is_invocation_valid(TaskSignature const &sig,
                          TaskInvocation const &inv);
- 
+
 }
 
 #endif
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 194d64c34b..ff23c269e7 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -154,27 +154,29 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
-  return map_values(binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
-    return arg_binding.template visit<ConcreteArgSpec>(
-        overload{[&](OpArgRefSpec const &s) {
-                   return this->resolve_op_arg_ref_spec(s, op_guid);
-                 },
-                 [&](RuntimeArgRefSpec const &s) {
-                   return this->resolve_runtime_arg_ref_spec(s);
-                 },
-                 [](ConcreteArgSpec const &s) { return s; }});
-  });
+  return map_values(
+      binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](OpArgRefSpec const &s) {
+                       return this->resolve_op_arg_ref_spec(s, op_guid);
+                     },
+                     [&](RuntimeArgRefSpec const &s) {
+                       return this->resolve_runtime_arg_ref_spec(s);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
 }
 
 ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing(
     TaskBinding const &binding) const {
-  return map_values(binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
-    return arg_binding.template visit<ConcreteArgSpec>(
-        overload{[&](RuntimeArgRefSpec const &s) {
-                   return this->resolve_runtime_arg_ref_spec(s);
-                 },
-                 [](ConcreteArgSpec const &s) { return s; }});
-  });
+  return map_values(
+      binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](RuntimeArgRefSpec const &s) {
+                       return this->resolve_runtime_arg_ref_spec(s);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
   ;
 }
 
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index 7f0b179390..9c1136f198 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -2,8 +2,8 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/model_training_instance.h"
 #include "local-execution/optimizer.h"
-#include "local-execution/task_signature_impl.h"
 #include "local-execution/task_invocation.h"
+#include "local-execution/task_signature_impl.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"

From b5647c8336848f0030445c9254cfc0e07b88ef4f Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 09:17:46 -0700
Subject: [PATCH 14/91] Pass tests after merge

---
 lib/kernels/include/kernels/profiling.h       |  1 -
 .../model_training_instance.struct.toml       |  2 +-
 .../src/local_training_backing.cc             |  2 +-
 .../test/src/test_local_cost_estimator.cc     |  2 +-
 lib/local-execution/test/src/test_loss_e2e.cc |  6 ++---
 .../test/src/test_update_e2e.cc               |  4 ++--
 .../op-attrs/ops/loss_attrs.variant.toml      | 22 ------------------
 .../op-attrs/ops/loss_function.enum.toml      | 23 -------------------
 .../include/op-attrs/ops/loss_functions.h     |  8 +++----
 .../loss_functions/loss_attrs.variant.toml    |  6 ++---
 .../ops/loss_functions/loss_functions.h       | 20 ----------------
 .../nonconfigurable_loss_attrs.struct.toml    |  2 +-
 .../other_loss_attrs.struct.toml              | 18 ---------------
 ...arse_categorical_ce_loss_attrs.struct.toml | 14 -----------
 .../src/op-attrs/ops/loss_functions.cc        |  2 +-
 15 files changed, 17 insertions(+), 115 deletions(-)
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
 rename lib/op-attrs/include/op-attrs/ops/{ => loss_functions}/nonconfigurable_loss_attrs.struct.toml (80%)
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
 delete mode 100644 lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml

diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 31c70010a0..655d540685 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -40,7 +40,6 @@ std::optional profiling_wrapper(F const &f,
   }
 
   float elapsed = 0;
-  std::cout << "hello";
   checkCUDA(ffEventRecord(t_end, stream));
   checkCUDA(ffEventSynchronize(t_end));
   checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end));
diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
index b460d6bd3a..28282e21c0 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml
+++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml
@@ -8,7 +8,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/ops/loss_attrs.dtg.h",
+  "op-attrs/ops/loss_functions/loss_attrs.dtg.h",
   "pcg/tensor_guid_t.dtg.h",
   "pcg/optimizer_attrs.dtg.h",
 ]
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index b794cc6da6..edbb377047 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -4,10 +4,10 @@
 #include "local-execution/optimizer.h"
 #include "local-execution/task_invocation.h"
 #include "local-execution/task_signature_impl.h"
+#include "pcg/computation_graph.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
-#include "pcg/computation_graph.h"
 #include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
index 4c01df53e9..2b22d64969 100644
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ -31,7 +31,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         /*kdim=*/embed_dim,
         /*vdim=*/embed_dim,
         /*dropout=*/0.0,
-        /*bias=*/true,
+        /*bias=*/false,
         /*add_bias_kv=*/false,
         /*add_zero_attn=*/false,
     };
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 3bc85354a0..4801aff6a9 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -33,7 +33,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     TensorShape input_shape = TensorShape{
         TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
     tensor_guid_t input_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::YES);
+        cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
     tensor_guid_t logit_tensor =
@@ -50,7 +50,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
       TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT};
       tensor_guid_t label_tensor =
-          cg_builder.create_tensor(label_shape, CreateGrad::NO);
+          cg_builder.create_input(label_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
           allocator.allocate_tensor(label_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
@@ -73,7 +73,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
 
    SUBCASE("OtherAttrs") {
       tensor_guid_t label_tensor =
-          cg_builder.create_tensor(input_shape, CreateGrad::NO);
+          cg_builder.create_input(input_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(input_shape);
       tensor_backing_map.insert({label_tensor, label_backing});
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index b5a503f430..af4303fab8 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -27,7 +27,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     TensorShape input_shape = TensorShape{
         TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
     tensor_guid_t input_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::YES);
+        cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
     tensor_guid_t logit_tensor =
@@ -41,7 +41,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
     tensor_backing_map.insert({input_tensor, input_backing});
 
     tensor_guid_t label_tensor =
-        cg_builder.create_tensor(input_shape, CreateGrad::NO);
+        cg_builder.create_input(input_shape, CreateGrad::NO);
     GenericTensorAccessorW label_backing =
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({label_tensor, label_backing});
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
deleted file mode 100644
index d60c6507cf..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_attrs.variant.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-namespace = "FlexFlow"
-name = "LossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "json",
-  "fmt",
-]
-
-includes = [
-  "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h",
-  "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
-]
-
-[[values]]
-type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
-key = "sparse_categorical_ce_loss_attrs"
-
-[[values]]
-type = "::FlexFlow::NonconfigurableLossAttrs"
-key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml b/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
deleted file mode 100644
index b9cd13eabf..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_function.enum.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-namespace = "FlexFlow"
-name = "LossFunction"
-features = [
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[values]]
-name = "CATEGORICAL_CROSSENTROPY"
-
-[[values]]
-name = "SPARSE_CATEGORICAL_CROSSENTROPY"
-
-[[values]]
-name = "MEAN_SQUARED_ERROR_AVG_REDUCE"
-
-[[values]]
-name = "MEAN_SQUARED_ERROR_SUM_REDUCE"
-
-[[values]]
-name = "IDENTITY"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
index 74d2d0a479..657f8d91dc 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions.h
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions.h
@@ -2,10 +2,10 @@
 #define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
 
 #include "op-attrs/ops/core.h"
-#include "op-attrs/ops/loss_attrs.dtg.h"
-#include "op-attrs/ops/loss_function.dtg.h"
-#include "op-attrs/ops/nonconfigurable_loss_attrs.dtg.h"
-#include "op-attrs/ops/sparse_categorical_ce_loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/loss_function.dtg.h"
+#include "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h"
+#include "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
index 17293095e4..943760d949 100644
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_attrs.variant.toml
@@ -11,7 +11,7 @@ features = [
 
 includes = [
   "op-attrs/ops/loss_functions/sparse_categorical_cross_entropy_loss_attrs.dtg.h",
-  "op-attrs/ops/loss_functions/other_loss_attrs.dtg.h",
+  "op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.dtg.h",
 ]
 
 [[values]]
@@ -19,5 +19,5 @@ type = "::FlexFlow::SparseCategoricalCrossEntropyLossAttrs"
 key = "sparse_categorical_cross_entropy_loss"
 
 [[values]]
-type = "::FlexFlow::OtherLossAttrs"
-key = "other_loss"
+type = "::FlexFlow::NonconfigurableLossAttrs"
+key = "nonconfigurable_loss_attrs"
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h b/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
deleted file mode 100644
index ca8f3e6602..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/loss_functions.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
-#define _FLEXFLOW_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_LOSS_FUNCTIONS_H
-
-#include "op-attrs/ops/core.h"
-#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
-#include "op-attrs/ops/loss_functions/loss_function.dtg.h"
-
-namespace FlexFlow {
-
-CHECK_VALID_OP_ATTR(LossAttrs);
-
-LossFunction parse_loss_function_name(std::string const &);
-
-LossFunction get_loss_function(OtherLossAttrs const &);
-LossFunction get_loss_function(SparseCategoricalCrossEntropyLossAttrs const &);
-LossFunction get_loss_function(LossAttrs const &);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
similarity index 80%
rename from lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
rename to lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
index 0420e7ef7b..3fe7ac86c5 100644
--- a/lib/op-attrs/include/op-attrs/ops/nonconfigurable_loss_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/loss_functions/nonconfigurable_loss_attrs.struct.toml
@@ -10,7 +10,7 @@ features = [
 ]
 
 includes = [
-  "op-attrs/ops/loss_function.dtg.h"
+  "op-attrs/ops/loss_functions/loss_function.dtg.h"
 ]
 
 [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
deleted file mode 100644
index 284a4b1d7d..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/loss_functions/other_loss_attrs.struct.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-namespace = "FlexFlow"
-name = "OtherLossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "fmt",
-  "rapidcheck",
-  "json",
-]
-
-includes = [
-  "op-attrs/ops/loss_functions/loss_function.dtg.h",
-]
-
-[[fields]]
-name = "loss_type"
-type = "::FlexFlow::LossFunction"
diff --git a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
deleted file mode 100644
index 21378a1154..0000000000
--- a/lib/op-attrs/include/op-attrs/ops/sparse_categorical_ce_loss_attrs.struct.toml
+++ /dev/null
@@ -1,14 +0,0 @@
-namespace = "FlexFlow"
-name = "SparseCategoricalCrossEntropyLossAttrs"
-features = [
-  "eq",
-  "ord",
-  "hash",
-  "json",
-  "rapidcheck",
-  "fmt",
-]
-
-[[fields]]
-name = "replace_labels"
-type = "bool"
diff --git a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
index a5c6aeb2a5..2b9a7533f0 100644
--- a/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
+++ b/lib/op-attrs/src/op-attrs/ops/loss_functions.cc
@@ -1,4 +1,4 @@
-#include "op-attrs/ops/loss_functions/loss_functions.h"
+#include "op-attrs/ops/loss_functions.h"
 #include "utils/containers/transform.h"
 #include "utils/exception.h"
 #include "utils/overload.h"

From f5ff91e9757a73c94d73dddaec2243b0c46c87ec Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 10:49:02 -0700
Subject: [PATCH 15/91] Fix input/weight differentiation

---
 .../local-execution/local_slots_backing.h     |  2 ++
 .../src/local_slots_backing.cc                | 24 +++++++++----------
 .../test/src/test_local_slots_backing.cc      | 12 +++++++---
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 4c6dbacfe3..93c534e583 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -54,6 +54,8 @@ struct LocalSlotsBacking {
   TensorBackingMap gradient_tensor_mapping;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
       input_tensor_slots;
+  std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
+      weight_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
       output_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index 93cfe4498c..bdbfa4f222 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -22,8 +22,10 @@ void LocalSlotsBacking::allocate_outgoing_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  std::vector<tensor_guid_t> incoming_tensors =
-      get_incoming_tensors(computation_graph, layer_guid);
+  std::vector<tensor_guid_t> incoming_input_tensors =
+      get_incoming_inputs(computation_graph, layer_guid);
+  std::vector<tensor_guid_t> incoming_weight_tensors =
+      get_incoming_weights(computation_graph, layer_guid);
   std::vector<tensor_guid_t> outgoing_tensors =
       get_outgoing_tensors(computation_graph, layer_guid);
   for (tensor_guid_t const &output_tensor : outgoing_tensors) {
@@ -46,7 +48,8 @@ void LocalSlotsBacking::allocate_outgoing_tensors(
     }
   }
 
-  this->input_tensor_slots.insert({layer_guid, incoming_tensors});
+  this->input_tensor_slots.insert({layer_guid, incoming_input_tensors});
+  this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors});
   this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
@@ -100,13 +103,6 @@ GenericTensorAccessorW const &
 TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     OpTaskBinding const &binding, layer_guid_t const &op_guid) const {
   TensorSlotsBacking mapping;
-  int num_inputs = 0;
-  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    if (tensor_binding.first.is_grad == IsGrad::NO &&
-        tensor_binding.second.role == TensorRole::INPUT) {
-      num_inputs += 1;
-    }
-  }
 
   for (auto const &tensor_binding : binding.get_tensor_bindings()) {
     SlotGradId slot_grad_id = tensor_binding.first;
@@ -115,7 +111,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     int weight_adjusted_idx = 0;
     switch (tensor_spec.role) {
       case TensorRole::WEIGHT:
-        weight_adjusted_idx = num_inputs;
+        assert(contains_key(this->weight_tensor_slots, op_guid));
+        tensor_guids = this->weight_tensor_slots.at(op_guid);
+        break;
       case TensorRole::INPUT:
         assert(contains_key(this->input_tensor_slots, op_guid));
        tensor_guids = this->input_tensor_slots.at(op_guid);
@@ -130,8 +128,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
-        tensor_guids.at(weight_adjusted_idx + tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing =
+        this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
index c18108d6b4..779ba43f26 100644
--- a/lib/local-execution/test/src/test_local_slots_backing.cc
+++ b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -157,11 +157,17 @@ TEST_SUITE(FF_TEST_SUITE) {
       local_slots_backing.allocate_outgoing_tensors(
           layer_guid, cg_builder.computation_graph, allocator);
       SUBCASE("Input tensor slots") {
-        std::vector<tensor_guid_t> correct_incoming_tensors =
-            get_incoming_tensors(cg_builder.computation_graph, layer_guid);
-        CHECK(correct_incoming_tensors ==
+        std::vector<tensor_guid_t> correct_incoming_input_tensors =
+            get_incoming_inputs(cg_builder.computation_graph, layer_guid);
+        CHECK(correct_incoming_input_tensors ==
              local_slots_backing.input_tensor_slots.at(layer_guid));
       }
+      SUBCASE("Weight tensor slots") {
+        std::vector<tensor_guid_t> correct_incoming_weight_tensors =
+            get_incoming_weights(cg_builder.computation_graph, layer_guid);
+        CHECK(correct_incoming_weight_tensors ==
+              local_slots_backing.weight_tensor_slots.at(layer_guid));
+      }
       SUBCASE("Output tensor slots") {
         std::vector<tensor_guid_t> correct_outgoing_tensors =
             get_outgoing_tensors(cg_builder.computation_graph, layer_guid);

From 7470e71eaa959f2304fc5e111b18f045473c3364 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 11:53:07 -0700
Subject: [PATCH 16/91] Fix signature to use unified rep

---
 .../local-execution/local_slots_backing.h     |  9 ++-
 .../non_graph_tensor_guid_t.struct.toml       | 17 ++++++
 .../include/local-execution/optimizer.h       |  9 +--
 .../task_signature.struct.toml                |  5 +-
 .../tensor_guid_slot_spec.struct.toml         |  5 --
 .../tensor_guid_spec.struct.toml              |  3 +-
 .../unified_tensor_guid.variant.toml          | 21 +++++++
 .../src/local_slots_backing.cc                | 56 +++++++++++--------
 .../src/local_training_backing.cc             |  5 +-
 lib/local-execution/src/loss_functions.cc     |  6 +-
 lib/local-execution/src/optimizer.cc          | 22 ++++----
 lib/local-execution/src/task_signature.cc     |  4 +-
 lib/pcg/include/pcg/computation_graph.h       |  3 -
 lib/pcg/src/pcg/computation_graph.cc          | 13 -----
 14 files changed, 105 insertions(+), 73 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
 create mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml

diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h
index 93c534e583..d201d3c405 100644
--- a/lib/local-execution/include/local-execution/local_slots_backing.h
+++ b/lib/local-execution/include/local-execution/local_slots_backing.h
@@ -4,10 +4,12 @@
 
 #include "kernels/accessor.h"
 #include "local-execution/local_task_argument_accessor.h"
+#include "local-execution/non_graph_tensor_guid_t.dtg.h"
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/per_device_op_state.h"
 #include "local-execution/runtime_arg_config.h"
 #include "local-execution/task_invocation.dtg.h"
+#include "local-execution/unified_tensor_guid.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/layer_guid_t.dtg.h"
 #include "pcg/tensor_guid_t.dtg.h"
@@ -16,6 +18,8 @@ namespace FlexFlow {
 
 using TensorBackingMap =
     std::unordered_map<tensor_guid_t, GenericTensorAccessorW>;
+using NonGraphTensorBackingMap =
+    std::unordered_map<non_graph_tensor_guid_t, GenericTensorAccessorW>;
 
 struct LocalSlotsBacking {
   LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &);
@@ -42,7 +46,7 @@ struct LocalSlotsBacking {
   ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &,
                                           layer_guid_t const &) const;
 
-  GenericTensorAccessorW const &get_tensor_backing(tensor_guid_t const &,
+  GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &,
                                                    IsGrad) const;
   bool is_tensor_allocated(tensor_guid_t const &) const;
 
@@ -52,13 +56,14 @@ struct LocalSlotsBacking {
   // tensors
   TensorBackingMap tensor_mapping;
   TensorBackingMap gradient_tensor_mapping;
+  NonGraphTensorBackingMap optimizer_tensor_mapping;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      input_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      weight_tensor_slots;
   std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
      output_tensor_slots;
-  std::unordered_map<layer_guid_t, std::vector<tensor_guid_t>>
+  std::unordered_map<layer_guid_t, std::vector<non_graph_tensor_guid_t>>
      weight_optimizer_tensor_guids;
 
   // arguments
diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
new file mode 100644
index 0000000000..8904c232c9
--- /dev/null
+++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml
@@ -0,0 +1,17 @@
+namespace = "FlexFlow"
+name = "non_graph_tensor_guid_t"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "fmt",
+  "json",
+]
+
+includes = [
+  "",
+]
+
+[[fields]]
+name = "raw_uid"
+type = "size_t"
diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h
index 1e2cd65362..acf9b8a550 100644
--- a/lib/local-execution/include/local-execution/optimizer.h
+++ b/lib/local-execution/include/local-execution/optimizer.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_
 
+#include "local-execution/non_graph_tensor_guid_t.dtg.h"
 #include "local-execution/task_impl_function.dtg.h"
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/task_signature.h"
@@ -14,20 +15,20 @@ TaskSignature get_update_signature(OptimizerAttrs const &);
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &,
     tensor_guid_t const &weight,
-    std::vector<tensor_guid_t> const &grad_buffer_tensors);
+    std::vector<non_graph_tensor_guid_t> const &grad_buffer_tensors);
 TaskImplFunction get_update_task_impl(OptimizerAttrs const &);
 
 TaskSignature get_sgd_update_signature();
 TaskInvocation sgd_update(SGDOptimizerAttrs const &,
                           tensor_guid_t const &weight,
-                          tensor_guid_t const &sgd_v);
+                          non_graph_tensor_guid_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
                           tensor_guid_t const
&weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m); + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml index fd15df91d5..ac408a7b68 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature.struct.toml @@ -8,15 +8,14 @@ features = [ includes = [ "local-execution/tensor_guid_slot_spec.dtg.h", + "local-execution/slot_id_t.dtg.h", "", "" ] src_includes = [ "utils/fmt/unordered_map.h", - "utils/fmt/unordered_set.h", "utils/hash/unordered_map.h", - "utils/hash/unordered_set.h", "utils/fmt/optional.h", "utils/type_index.h" ] @@ -31,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>" [[fields]] name = "tensor_guid_slots" -type = "std::unordered_set<::FlexFlow::TensorGuidSlotSpec>" +type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>" diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml index 4b3e5b2674..9b7e9c14f9 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml @@ -8,15 +8,10 @@ features = [ ] includes = [ - "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/is_grad.dtg.h", ] -[[fields]] -name = "name" -type = "::FlexFlow::slot_id_t" - [[fields]] name = "slot_type" type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml index a51d6ccf1b..1d147f60e5 100644 --- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml +++ b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml @@ -10,11 +10,12 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", "local-execution/is_grad.dtg.h", + "local-execution/unified_tensor_guid.dtg.h" ] [[fields]] name = "tensor_guid" -type = "::FlexFlow::tensor_guid_t" +type = "::FlexFlow::UnifiedTensorGuid" [[fields]] name = "is_grad" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml new file mode 100644 index 0000000000..3d2cd8e45f --- /dev/null +++ b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "UnifiedTensorGuid" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/non_graph_tensor_guid_t.dtg.h", +] + +[[values]] +type = "::FlexFlow::tensor_guid_t" +key = "tensor_guid" + +[[values]] +type = "::FlexFlow::non_graph_tensor_guid_t" +key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index bdbfa4f222..f10b7c0126 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -60,18 +60,19 @@ void LocalSlotsBacking::allocate_optimizer_tensors( Allocator &allocator, TaskSignature const &sig) { GenericTensorAccessorW 
weight_backing =
-      get_tensor_backing(weight, IsGrad::NO);
+      get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO);
   int num_grad_buffer_tensors =
       sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad)
-  std::vector<tensor_guid_t> grad_buffer_tensors =
-      get_new_tensor_guids_for_layer_without_graph_insertion(
-          cg, weight_layer, num_grad_buffer_tensors);
-  for (tensor_guid_t const &tensor_guid : grad_buffer_tensors) {
+  std::vector<non_graph_tensor_guid_t> grad_buffer_tensors;
+  for (int i = 0; i < num_grad_buffer_tensors; ++i) {
+    non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i};
     GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
         get_tensor_shape(weight_backing.shape, weight_backing.data_type));
-    this->gradient_tensor_mapping.insert({tensor_guid, buffer_backing});
+    this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing});
+    grad_buffer_tensors.push_back(buffer_tensor_guid);
   }
-  this->weight_optimizer_tensor_guids.insert({weight, grad_buffer_tensors});
+  this->weight_optimizer_tensor_guids.insert(
+      {weight_layer, grad_buffer_tensors});
 }
 
 bool LocalSlotsBacking::is_tensor_allocated(
@@ -85,18 +86,26 @@ bool LocalSlotsBacking::is_gradient_tensor_allocated(
 }
 
 GenericTensorAccessorW const &
-    LocalSlotsBacking::get_tensor_backing(tensor_guid_t const &tensor_id,
+    LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id,
                                           IsGrad is_grad) const {
-  switch (is_grad) {
-    case IsGrad::NO:
-      assert(contains_key(this->tensor_mapping, tensor_id));
-      return this->tensor_mapping.at(tensor_id);
-    case IsGrad::YES:
-      assert(contains_key(this->gradient_tensor_mapping, tensor_id));
-      return this->gradient_tensor_mapping.at(tensor_id);
-    default:
-      throw mk_runtime_error(fmt::format(
-          "IsGrad should only have YES or NO, received {}", is_grad));
+  if (tensor_id.has<tensor_guid_t>()) {
+    tensor_guid_t graph_tensor_guid = tensor_id.get<tensor_guid_t>();
+    switch (is_grad) {
+      case IsGrad::NO:
+        assert(contains_key(this->tensor_mapping, graph_tensor_guid));
+        return this->tensor_mapping.at(graph_tensor_guid);
+      case IsGrad::YES:
+        assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid));
+        return this->gradient_tensor_mapping.at(graph_tensor_guid);
+      default:
+        throw mk_runtime_error(fmt::format(
+            "IsGrad should only have YES or NO, received {}", is_grad));
+    }
+  } else {
+    non_graph_tensor_guid_t non_graph_tensor_guid =
+        tensor_id.get<non_graph_tensor_guid_t>();
+    assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid));
+    return this->optimizer_tensor_mapping.at(non_graph_tensor_guid);
   }
 }
 
@@ -128,8 +137,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     }
 
     IsGrad is_grad = slot_grad_id.is_grad;
-    GenericTensorAccessorW tensor_backing =
-        this->get_tensor_backing(tensor_guids.at(tensor_spec.idx), is_grad);
+    GenericTensorAccessorW tensor_backing = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad);
 
     mapping.insert({slot_grad_id, tensor_backing});
   }
@@ -144,8 +153,8 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing(
     SlotGradId slot_grad_id = tensor_binding.first;
     TensorGuidSpec tensor_spec = tensor_binding.second;
 
-    GenericTensorAccessorW accessor =
-        this->get_tensor_backing(tensor_spec.tensor_guid, slot_grad_id.is_grad);
+    GenericTensorAccessorW accessor = this->get_tensor_backing(
+        UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad);
 
     mapping.insert({slot_grad_id, accessor});
   }
@@ -199,7 +208,8 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec(
assert(input_tensor_guids.size() > index_op_arg_ref.idx); GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - input_tensor_guids.at(index_op_arg_ref.idx), IsGrad::NO); + UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, + IsGrad::NO); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index edbb377047..dafa28a70f 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -162,9 +162,8 @@ void LocalTrainingBacking::execute_update() { // get tensors tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = - this->local_slots_backing.weight_optimizer_tensor_guids.at( - weight_tensor); + std::vector grad_buffer_tensors = + this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 3a4c616377..a37c1d706b 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -35,9 +35,9 @@ TaskSignature get_loss_bwd_signature() { TaskInvocation backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{label, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{logit, IsGrad::YES}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); + b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); + b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 30f20bf8ec..1e06dee96a 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -22,12 +22,12 @@ TaskSignature get_sgd_update_signature() { TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &sgd_v) { + non_graph_tensor_guid_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorGuidSpec{sgd_v, IsGrad::YES}); + b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES}); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -111,13 +111,13 @@ TaskSignature get_adam_update_signature() { TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, tensor_guid_t const &weight, - tensor_guid_t const &adam_v, - tensor_guid_t const &adam_m) { + non_graph_tensor_guid_t const &adam_v, + non_graph_tensor_guid_t const &adam_m) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{weight, IsGrad::NO}); - b.bind(ADAM_M, TensorGuidSpec{adam_m, IsGrad::YES}); - b.bind(ADAM_V, TensorGuidSpec{adam_v, IsGrad::YES}); + b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); + b.bind(WEIGHT, 
TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); + b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES}); + b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES}); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -192,7 +192,7 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors) { + std::vector const &grad_buffer_tensors) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { return sgd_update(s, weight, grad_buffer_tensors.at(0)); diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index 3bba9e2c8a..27bcbcd266 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -18,8 +18,8 @@ void add_slot(TaskSignature &task_signature, IsGrad is_grad, SlotType slot_type) { TensorGuidSlotSpec tensor_guid_slot_spec = - TensorGuidSlotSpec{name, slot_type, is_grad}; - task_signature.tensor_guid_slots.insert(tensor_guid_slot_spec); + TensorGuidSlotSpec{slot_type, is_grad}; + task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 32ed0e3025..f70d9f7404 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -52,9 +52,6 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n); layer_guid_t get_layer_by_name(ComputationGraph const &cg, std::string const &name); -std::vector - get_new_tensor_guids_for_layer_without_graph_insertion( - ComputationGraph const &, layer_guid_t const &n, int num_tensors); std::string as_dot(ComputationGraph const &); void debug_print_dot(ComputationGraph const &); diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 6f6c10d798..a69e54fd93 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -175,19 +175,6 @@ layer_guid_t get_layer_by_name(ComputationGraph const &cg, return get_only(found); } -std::vector - get_new_tensor_guids_for_layer_without_graph_insertion( - ComputationGraph const &cg, layer_guid_t const &n, int num_tensors) { - std::vector new_tensor_guids; - int num_outgoing_tensors = get_outgoing_tensors(cg, n).size(); - - for (int i = 0; i < num_tensors; ++i) { - new_tensor_guids.push_back( - tensor_guid_t{DataflowOutput{n.raw_node, num_outgoing_tensors + i}}); - } - return new_tensor_guids; -} - std::string as_dot(ComputationGraph const &cg) { std::function get_node_label = [](LayerAttrs const &a) -> std::string { From deece1be7eae96ef4604679a13c2ec58207632e3 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 1 Oct 2024 12:39:29 -0700 Subject: [PATCH 17/91] Fix model training instance abstraction --- .../local-execution/local_training_backing.h | 5 +++- .../local-execution/model_training_instance.h | 13 -------- .../model_training_instance.struct.toml | 7 +---- .../non_graph_tensor_guid_t.struct.toml | 6 +--- .../src/local_cost_estimator.cc | 4 ++- .../src/local_training_backing.cc | 22 +++++++------- .../src/model_training_instance.cc | 30 ------------------- lib/local-execution/test/src/test_loss_e2e.cc | 24 +++++++-------- .../test/src/test_update_e2e.cc | 18 +++++------ lib/pcg/include/pcg/optimizer_attrs.h | 13 ++++++++ 
lib/pcg/src/pcg/optimizer_attrs.cc | 24 +++++++++++++++ 11 files changed, 79 insertions(+), 87 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/src/model_training_instance.cc create mode 100644 lib/pcg/include/pcg/optimizer_attrs.h create mode 100644 lib/pcg/src/pcg/optimizer_attrs.cc diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 08a458cb7f..2313d55732 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -5,6 +5,7 @@ #include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" #include "pcg/computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" namespace FlexFlow { @@ -16,7 +17,8 @@ struct LocalTrainingBacking { ComputationGraph const &, TensorBackingMap const &, RuntimeArgConfig const &, - std::optional &); + std::optional const &, + std::optional const &); void execute_init(); PerLayerElapsedTime execute_forward(); @@ -38,6 +40,7 @@ struct LocalTrainingBacking { TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; std::optional training_instance; + std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h deleted file mode 100644 index afc8fa7472..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ /dev/null @@ -1,13 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H - -#include "local-execution/model_training_instance.dtg.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml index 28282e21c0..dcfaf2175d 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ b/lib/local-execution/include/local-execution/model_training_instance.struct.toml @@ -9,8 +9,7 @@ features = [ includes = [ "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h", - "pcg/optimizer_attrs.dtg.h", + "pcg/tensor_guid_t.dtg.h" ] [[fields]] @@ -24,7 +23,3 @@ type = "::FlexFlow::tensor_guid_t" [[fields]] name = "logit_tensor" type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "optimizer_attrs" -type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml index 8904c232c9..4832ecaafa 100644 --- a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml +++ b/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml @@ -8,10 +8,6 @@ features = [ "json", ] -includes = [ - "", -] - [[fields]] name = "raw_uid" -type = "size_t" +type = "int" diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index f153db3240..186c2d516a 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ 
b/lib/local-execution/src/local_cost_estimator.cc @@ -76,11 +76,13 @@ CostDetails LocalCostEstimator::estimate_cost( get_vector_piece_attrs(outputs)); std::optional model_training_instance = std::nullopt; + std::optional optimizer_attrs = std::nullopt; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, this->runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); PerLayerElapsedTime fwd = local_backing.execute_forward(); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index dafa28a70f..46a8f83709 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,10 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" -#include "local-execution/model_training_instance.h" #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -18,11 +18,12 @@ LocalTrainingBacking::LocalTrainingBacking( ComputationGraph const &computation_graph, TensorBackingMap const &tensor_backing_mapping, RuntimeArgConfig const &runtime_arg_config, - std::optional &training_instance) + std::optional const &training_instance, + std::optional const &optimizer_attrs) : allocator(allocator), computation_graph(computation_graph), local_slots_backing(tensor_backing_mapping, runtime_arg_config), task_registry(empty_task_registry()), - training_instance(training_instance) { + training_instance(training_instance), optimizer_attrs(optimizer_attrs) { for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -38,8 +39,8 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; - TaskSignature sig = get_update_signature(attrs); + assert(this->optimizer_attrs.has_value()); + TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); this->local_slots_backing.allocate_optimizer_tensors( @@ -153,7 +154,7 @@ PerLayerElapsedTime LocalTrainingBacking::execute_backward() { void LocalTrainingBacking::execute_update() { assert(this->training_instance.has_value()); - OptimizerAttrs attrs = this->training_instance.value().optimizer_attrs; + assert(this->optimizer_attrs.has_value()); for (layer_guid_t const &node : topological_ordering(this->computation_graph)) { @@ -166,18 +167,19 @@ void LocalTrainingBacking::execute_update() { this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation - TaskInvocation invocation = - get_update_invocation(attrs, weight_tensor, grad_buffer_tensors); + TaskInvocation invocation = get_update_invocation( + this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); - TaskImplFunction update_impl_fn = get_update_task_impl(attrs); + TaskImplFunction update_impl_fn = + 
get_update_task_impl(this->optimizer_attrs.value()); update_impl_fn.get().function_ptr(accessor); } } - this->training_instance = next(this->training_instance.value()); + this->optimizer_attrs = next(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc deleted file mode 100644 index c626bfc0e0..0000000000 --- a/lib/local-execution/src/model_training_instance.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "local-execution/model_training_instance.h" - -namespace FlexFlow { - -ModelTrainingInstance next(ModelTrainingInstance const &old_training_instance) { - if (old_training_instance.optimizer_attrs.has()) { - AdamOptimizerAttrs old = - old_training_instance.optimizer_attrs.get(); - double new_beta1_t = old.beta_t * old.beta1; - double new_beta2_t = old.beta2_t * old.beta2; - double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t); - OptimizerAttrs new_attrs = - OptimizerAttrs{AdamOptimizerAttrs{old.alpha, - old.beta1, - old.beta2, - old.weight_decay, - new_alpha_t, - new_beta1_t, - new_beta2_t, - old.epsilon}}; - return ModelTrainingInstance{old_training_instance.loss_attrs, - old_training_instance.label_tensor, - old_training_instance.logit_tensor, - new_attrs}; - } else { - return old_training_instance; - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 4801aff6a9..72df1a08f1 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -59,13 +59,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{SparseCategoricalCrossEntropyLossAttrs{ /*replace_labels=*/false}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -83,13 +83,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{ LossFunction::CATEGORICAL_CROSSENTROPY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -101,13 +101,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + optimizer_attrs); local_backing.execute_init(); local_backing.execute_forward(); local_backing.execute_backward(); @@ -118,13 +118,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ModelTrainingInstance{ LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}, label_tensor, - logit_tensor, - optimizer_attrs}; + logit_tensor}; LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, tensor_backing_map, runtime_arg_config, - model_training_instance); + model_training_instance, + 
optimizer_attrs);
       local_backing.execute_init();
       local_backing.execute_forward();
       local_backing.execute_backward();
     }
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index af4303fab8..96b748806f 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -58,13 +58,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
@@ -81,13 +81,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
@@ -109,13 +109,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             LossAttrs{NonconfigurableLossAttrs{
                 LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
             label_tensor,
-            logit_tensor,
-            optimizer_attrs};
+            logit_tensor};
     LocalTrainingBacking local_backing(allocator,
                                        cg_builder.computation_graph,
                                        tensor_backing_map,
                                        runtime_arg_config,
-                                       model_training_instance);
+                                       model_training_instance,
+                                       optimizer_attrs);
     local_backing.execute_init();
     local_backing.execute_forward();
     local_backing.execute_backward();
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
new file mode 100644
index 0000000000..4b78f66fe4
--- /dev/null
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -0,0 +1,13 @@
+
+#ifndef _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H
+#define _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H
+
+#include "pcg/optimizer_attrs.dtg.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs next(OptimizerAttrs const &old);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc
new file mode 100644
index 0000000000..5307450a68
--- /dev/null
+++ b/lib/pcg/src/pcg/optimizer_attrs.cc
@@ -0,0 +1,24 @@
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+OptimizerAttrs next(OptimizerAttrs const &old_attrs) {
+  if (old_attrs.has<AdamOptimizerAttrs>()) {
+    AdamOptimizerAttrs old = old_attrs.get<AdamOptimizerAttrs>();
+    double new_beta1_t = old.beta_t * old.beta1;
+    double new_beta2_t = old.beta2_t * old.beta2;
+    double new_alpha_t = old.alpha * sqrt(1 - new_beta2_t) / (1 - new_beta1_t);
+    return OptimizerAttrs{AdamOptimizerAttrs{old.alpha,
+                                             old.beta1,
+                                             old.beta2,
+                                             old.weight_decay,
+                                             new_alpha_t,
+                                             new_beta1_t,
+                                             new_beta2_t,
+                                             old.epsilon}};
+  } else {
+    return old_attrs;
+  }
+}
+
+} // namespace FlexFlow

From 1d3cc9498fb5afe5e9f1b0aa1e50260a58e1c424 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Tue, 1 Oct 2024 13:10:28 -0700
Subject: [PATCH 18/91] Change subcase test name

---
 lib/local-execution/test/src/test_loss_e2e.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 72df1a08f1..37024adc26 100644
--- 
a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -71,7 +71,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { local_backing.execute_backward(); } - SUBCASE("OtherAttrs") { + SUBCASE("NonconfigurableLossAttrs") { tensor_guid_t label_tensor = cg_builder.create_input(input_shape, CreateGrad::NO); GenericTensorAccessorW label_backing = From 3cf5d08fb3b56f0e70145179c5dfd72eacd3cc2e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 16 Oct 2024 12:34:59 -0700 Subject: [PATCH 19/91] Quick fixes --- lib/kernels/include/kernels/array_shape.h | 8 ++++---- lib/kernels/include/kernels/legion_dim.h | 3 +++ lib/kernels/src/allocation.cc | 2 +- lib/kernels/src/array_shape.cc | 7 +++++-- lib/kernels/src/legion_dim.cc | 9 +++++++++ .../include/local-execution/arg_ref.h | 17 ++++++++++++++-- .../include/local-execution/concrete_arg.h | 14 +++++++++++++ .../include/local-execution/runtime_arg_ref.h | 16 +++++++++++++++ .../task_arg_spec.variant.toml | 4 +++- .../include/local-execution/task_binding.h | 18 +++++++++++++++++ .../task_invocation.struct.toml | 4 +++- .../src/local_training_backing.cc | 4 ++-- lib/local-execution/src/ops/element_unary.cc | 6 ++++-- lib/local-execution/src/runtime_arg_ref.cc | 13 ++++++++++++ lib/local-execution/src/task_binding.cc | 13 ++++++++++++ .../test/src/test_local_cost_estimator.cc | 20 ++++++++++++------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- .../parallel_computation_graph_builder.h | 4 ++++ lib/pcg/src/pcg/optimizer_attrs.cc | 3 ++- .../parallel_computation_graph_builder.cc | 2 +- 20 files changed, 144 insertions(+), 25 deletions(-) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index e60f0cd9c1..fd66697793 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -14,10 +14,10 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); - ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); - ArrayShape(LegionTensorDims const &); + explicit ArrayShape(size_t *dims, size_t num_dims); + explicit ArrayShape(TensorShape const &shape); + explicit ArrayShape(std::vector const &); + explicit ArrayShape(LegionTensorDims const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index e4dd9723b8..29c5e29a93 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -10,6 +10,9 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); +std::optional legion_dim_from_ff_dim(std::optional, + int num_dimensions); + template using LegionOrdered = DimOrdered; diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..b57fbee257 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -14,7 +14,7 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); - return {tensor_shape.data_type, tensor_shape, ptr}; + return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr}; } } // namespace FlexFlow diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 8464212290..31ee7b6001 100644 --- 
a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -61,8 +61,11 @@ ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - return ArrayShape{legion_dims_from_ff_dims( - slice(ff_ordered_from_legion_ordered(this->dims), start, end))}; + std::optional legion_start = + legion_dim_from_ff_dim(start, num_dims()); + std::optional legion_end = + legion_dim_from_ff_dim(end, num_dims()); + return this->sub_shape(legion_start, legion_end); } ArrayShape ArrayShape::sub_shape(std::optional start, diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 9ef47d40ae..c190a02220 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,4 +10,13 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { return legion_dim_t(num_dimensions - ff_dim.value - 1); } +std::optional + legion_dim_from_ff_dim(std::optional ff_dim, int num_dimensions) { + if (ff_dim.has_value()) { + return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions); + } else { + return std::nullopt; + } +} + } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h index 30326b0e84..30da405c13 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -60,6 +60,20 @@ struct ArgRefSpec { friend struct std::hash>; }; +template +std::string format_as(ArgRefSpec const &x) { + std::ostringstream oss; + oss << ""; + return oss.str(); +} + +template +std::ostream &operator<<(std::ostream &s, ArgRefSpec const &x) { + return (s << fmt::to_string(x)); +} + } // namespace FlexFlow namespace std { @@ -68,8 +82,7 @@ template struct hash<::FlexFlow::ArgRefSpec> { size_t operator()(::FlexFlow::ArgRefSpec const &s) const { size_t result = 0; - hash_combine(s.type_idx); - hash_combine(s.ref_type); + ::FlexFlow::hash_combine(result, s.type_idx); return result; } }; diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h index 3bc2714a71..ac5d97f3c4 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -3,6 +3,7 @@ #include "fmt/format.h" #include "local-execution/serialization.h" +#include "utils/hash-utils.h" #include "utils/type_index.h" #include @@ -53,4 +54,17 @@ std::ostream &operator<<(std::ostream &, ConcreteArgSpec const &); } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::ConcreteArgSpec> { + size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const { + size_t result = 0; + ::FlexFlow::hash_combine(result, s.get_type_index()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h index 279d854a27..fd79e23126 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -5,6 +5,8 @@ #include "local-execution/config.h" #include "local-execution/device_specific.h" #include "local-execution/profiling.h" +#include "utils/fmt.h" +#include "utils/type_index.h" namespace FlexFlow { @@ -14,6 +16,8 @@ enum class RuntimeArgRefType { FF_ITERATION_CONFIG }; +std::string 
to_string(RuntimeArgRefType const &); + template using RuntimeArgRef = ArgRef; @@ -23,6 +27,18 @@ RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); +// std::string format_as(RuntimeArgRefSpec const & x) { +// std::ostringstream oss; +// oss << ""; +// return oss.str(); +// } + +// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) { +// return (s << fmt::to_string(x)); +// } + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml index a6df0c8a7d..271e3b73d6 100644 --- a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml +++ b/lib/local-execution/include/local-execution/task_arg_spec.variant.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskArgSpec" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index cbe210f438..96c96473e4 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -7,6 +7,7 @@ #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" #include "local-execution/tensor_guid_spec.dtg.h" +#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -53,6 +54,23 @@ struct TaskBinding { tie() const; }; +std::string format_as(TaskBinding const &x); +std::ostream &operator<<(std::ostream &s, TaskBinding const &x); + } // namespace FlexFlow +namespace std { + +template <> +struct hash<::FlexFlow::TaskBinding> { + size_t operator()(::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; + } +}; + +} // namespace std + #endif diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/local-execution/include/local-execution/task_invocation.struct.toml index abcaabda93..c9e1e22ba1 100644 --- a/lib/local-execution/include/local-execution/task_invocation.struct.toml +++ b/lib/local-execution/include/local-execution/task_invocation.struct.toml @@ -1,7 +1,9 @@ namespace = "FlexFlow" name = "TaskInvocation" features = [ - "eq" + "eq", + "fmt", + "hash" ] includes = [ diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 46a8f83709..b7631470b7 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -39,7 +39,6 @@ LocalTrainingBacking::LocalTrainingBacking( // allocate optimizer buffers if (attrs.has() && this->training_instance.has_value()) { - assert(this->optimizer_attrs.has_value()); TaskSignature sig = get_update_signature(this->optimizer_attrs.value()); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); @@ -179,7 +178,8 @@ void LocalTrainingBacking::execute_update() { } } - this->optimizer_attrs = next(this->optimizer_attrs.value()); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs.value()); } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc index dbbfad10fb..ccb41d7461 100644 --- a/lib/local-execution/src/ops/element_unary.cc +++ 
b/lib/local-execution/src/ops/element_unary.cc
@@ -61,8 +61,10 @@ static DeviceSpecificDeviceStates
   ParallelTensorShape output_shape =
       throw_if_unexpected(get_output_shape(attrs, input_shape));
 
-  ElementUnaryPerDeviceState per_device_state = init_kernel(
-      get_piece_shape(input_shape), get_piece_shape(output_shape), attrs);
+  ElementUnaryPerDeviceState per_device_state =
+      init_kernel(ArrayShape{get_piece_shape(input_shape)},
+                  ArrayShape{get_piece_shape(output_shape)},
+                  attrs);
 
   return DeviceSpecificDeviceStates{
       DeviceSpecific<ElementUnaryPerDeviceState>::create(per_device_state)};
diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/local-execution/src/runtime_arg_ref.cc
index 56201a5c55..1f591b4d82 100644
--- a/lib/local-execution/src/runtime_arg_ref.cc
+++ b/lib/local-execution/src/runtime_arg_ref.cc
@@ -3,6 +3,19 @@
 
 namespace FlexFlow {
 
+std::string to_string(RuntimeArgRefType const &runtime_arg_ref_type) {
+  switch (runtime_arg_ref_type) {
+    case RuntimeArgRefType::FF_HANDLE:
+      return "FF_HANDLE";
+    case RuntimeArgRefType::PROFILING_SETTINGS:
+      return "PROFILING_SETTINGS";
+    case RuntimeArgRefType::FF_ITERATION_CONFIG:
+      return "FF_ITERATION_CONFIG";
+    default:
+      return "Unknown";
+  }
+}
+
 RuntimeArgRef<ProfilingSettings> profiling_settings() {
   return {RuntimeArgRefType::PROFILING_SETTINGS};
 }
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
index a5a3b2dc34..45d9d0cdb9 100644
--- a/lib/local-execution/src/task_binding.cc
+++ b/lib/local-execution/src/task_binding.cc
@@ -1,5 +1,6 @@
 #include "local-execution/task_binding.h"
 #include "utils/containers/contains_key.h"
+#include "utils/fmt/unordered_map.h"
 
 namespace FlexFlow {
 
@@ -41,4 +42,16 @@ std::unordered_map<slot_id_t, TaskArgSpec> const &
   return this->arg_bindings;
 }
 
+std::string format_as(TaskBinding const &x) {
+  std::ostringstream oss;
+  oss << "<TaskBinding";
+  oss << " tensor_bindings=" << x.get_tensor_bindings();
+  oss << " arg_bindings=" << x.get_arg_bindings();
+  oss << ">";
+  return oss.str();
+}
+
+std::ostream &operator<<(std::ostream &s, TaskBinding const &x) {
+  return (s << fmt::to_string(x));
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
+    std::vector<ParallelTensorAttrs> weights;
     ParallelTensorShape weights_shape = throw_if_unexpected(
         get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape));
-    ParallelTensorAttrs weight_attrs =
-        ParallelTensorAttrs{weights_shape,
-                            /*sync_type=*/std::nullopt,
-                            /*initializer=*/std::nullopt,
-                            CreateGrad::YES};
+    weights.push_back(make_weight_attrs(weights_shape, std::nullopt));
+    ParallelTensorShape input_bias_shape =
+        throw_if_unexpected(get_input_bias_shape(
+            attrs, inputs_shape, inputs_shape, inputs_shape));
+    weights.push_back(make_weight_attrs(input_bias_shape, std::nullopt));
+    ParallelTensorShape output_bias_shape =
+        throw_if_unexpected(get_output_bias_shape(
+            attrs, inputs_shape, inputs_shape, inputs_shape));
+    weights.push_back(make_weight_attrs(output_bias_shape, std::nullopt));
 
     ParallelTensorShape output_shape = throw_if_unexpected(
         get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape));
 
@@ -66,7 +72,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
             PCGOperatorAttrs{attrs},
             std::vector<ParallelTensorShape>{
                 inputs_shape, inputs_shape, inputs_shape},
-            std::vector<ParallelTensorAttrs>{weight_attrs},
+            weights,
             std::vector<ParallelTensorAttrs>{output_attrs},
             make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1}));
 
diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h
index 4b78f66fe4..d4abd1b52f 100644
--- a/lib/pcg/include/pcg/optimizer_attrs.h
+++ b/lib/pcg/include/pcg/optimizer_attrs.h
@@ -6,7 +6,7 @@
 
 namespace FlexFlow {
 
-OptimizerAttrs next(OptimizerAttrs const &old);
+OptimizerAttrs get_next_iteration_optimizer_attrs(OptimizerAttrs const &old);
 
 } // namespace FlexFlow
 
diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h
b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..35113553f2 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -179,6 +179,10 @@ struct ParallelComputationGraphBuilder { ParallelComputationGraph pcg; }; +ParallelTensorAttrs + make_weight_attrs(ParallelTensorShape const &shape, + std::optional const &initializer_attrs); + } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 5307450a68..8d66f7af7e 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -2,7 +2,8 @@ namespace FlexFlow { -OptimizerAttrs next(OptimizerAttrs const &old_attrs) { +OptimizerAttrs + get_next_iteration_optimizer_attrs(OptimizerAttrs const &old_attrs) { if (old_attrs.has()) { AdamOptimizerAttrs old = old_attrs.get(); double new_beta1_t = old.beta_t * old.beta1; diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index ce00ea62f4..b56156fe8a 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -20,7 +20,7 @@ static std::string get_default_name(PCGOperatorAttrs const &attrs) { return get_default_name(get_op_type(attrs)); } -static ParallelTensorAttrs make_weight_attrs( +ParallelTensorAttrs make_weight_attrs( ParallelTensorShape const &shape, std::optional const &initializer_attrs) { return ParallelTensorAttrs{ From 79ef4c964fa4abebf9813166353ecce230b83c75 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 22 Oct 2024 08:55:37 -0700 Subject: [PATCH 20/91] Refactor training backing and instance --- .../local-execution/local_slots_backing.h | 13 +- .../local-execution/local_training_backing.h | 26 +- .../local-execution/model_training_instance.h | 39 +++ .../model_training_instance.struct.toml | 25 -- .../include/local-execution/task_registry.h | 5 + .../src/local_cost_estimator.cc | 39 ++- .../src/local_slots_backing.cc | 64 +++-- .../src/local_training_backing.cc | 224 ++++++++---------- .../src/model_training_instance.cc | 64 +++++ lib/local-execution/src/task_registry.cc | 24 +- .../test/src/test_local_slots_backing.cc | 32 ++- lib/local-execution/test/src/test_loss_e2e.cc | 96 +++----- .../test/src/test_update_e2e.cc | 77 ++---- .../include/pcg/computation_graph_builder.h | 7 + lib/pcg/src/pcg/computation_graph_builder.cc | 14 +- 15 files changed, 402 insertions(+), 347 deletions(-) create mode 100644 lib/local-execution/include/local-execution/model_training_instance.h delete mode 100644 lib/local-execution/include/local-execution/model_training_instance.struct.toml create mode 100644 lib/local-execution/src/model_training_instance.cc diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index d201d3c405..46e66e97a2 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -9,6 +9,7 @@ #include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" 
#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" @@ -27,9 +28,15 @@ struct LocalSlotsBacking { public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void allocate_outgoing_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); + void insert_into_tensor_mapping(tensor_guid_t const &, + GenericTensorAccessorW const &); + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); void allocate_optimizer_tensors(layer_guid_t const &weight_layer, tensor_guid_t const &, ComputationGraph const &, diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 2313d55732..6dfa8ad443 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #include "local-execution/local_slots_backing.h" -#include "local-execution/model_training_instance.dtg.h" #include "local-execution/task_registry.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" @@ -16,19 +16,25 @@ struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, TensorBackingMap const &, - RuntimeArgConfig const &, - std::optional const &, - std::optional const &); - - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); + RuntimeArgConfig const &); + void register_and_allocate_layer(layer_guid_t const &); + void allocate_layer_optimizer_tensors(layer_guid_t const &, + OptimizerAttrs const &); + + void execute_init(layer_guid_t const &); + std::optional execute_forward(layer_guid_t const &); + void compute_loss(LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + tensor_guid_t const &label_tensor); + std::optional execute_backward(layer_guid_t const &); + void execute_update(layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; + void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); + private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); @@ -39,8 +45,6 @@ struct LocalTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; LocalSlotsBacking local_slots_backing; - std::optional training_instance; - std::optional optimizer_attrs; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h new file mode 100644 index 0000000000..08f373a16f --- /dev/null +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" + +namespace FlexFlow { + +using 
PerLayerElapsedTime = + std::unordered_map>; + +struct ModelTrainingInstance { + ModelTrainingInstance(Allocator const &, + ComputationGraph const &, + TensorBackingMap const &, + RuntimeArgConfig const &, + LossAttrs const &, + tensor_guid_t const & logit_tensor, + tensor_guid_t const & label_tensor, + OptimizerAttrs const &); + + void register_and_allocate_layers(); + void allocate_optimizer_tensors(); + void execute_init(); + PerLayerElapsedTime execute_forward(); + PerLayerElapsedTime execute_backward(); + void execute_update(); + + ComputationGraph computation_graph; + LocalTrainingBacking training_backing; + LossAttrs loss_attrs; + tensor_guid_t logit_tensor; + tensor_guid_t label_tensor; + OptimizerAttrs optimizer_attrs; +}; + +} + +#endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.struct.toml b/lib/local-execution/include/local-execution/model_training_instance.struct.toml deleted file mode 100644 index dcfaf2175d..0000000000 --- a/lib/local-execution/include/local-execution/model_training_instance.struct.toml +++ /dev/null @@ -1,25 +0,0 @@ -namespace = "FlexFlow" -name = "ModelTrainingInstance" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "op-attrs/ops/loss_functions/loss_attrs.dtg.h", - "pcg/tensor_guid_t.dtg.h" -] - -[[fields]] -name = "loss_attrs" -type = "::FlexFlow::LossAttrs" - -[[fields]] -name = "label_tensor" -type = "::FlexFlow::tensor_guid_t" - -[[fields]] -name = "logit_tensor" -type = "::FlexFlow::tensor_guid_t" diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index e00cc183da..24790a28e3 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -2,6 +2,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" @@ -13,6 +14,10 @@ void register_tasks_for_layer(TaskRegistry &, layer_guid_t const &, ComputationGraphOpAttrs const &attrs); +bool registry_contains_op_task(TaskRegistry const &, + layer_guid_t const &, + OpTaskType const &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 186c2d516a..c99a2b154f 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -4,6 +4,7 @@ #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" @@ -66,29 +67,27 @@ CostDetails LocalCostEstimator::estimate_cost( }; // add operator to graph - std::vector output_tensor_ids = - cg_builder.add_layer(layer_attrs, - input_tensor_ids, - transform(get_vector_piece_attrs(weights), - [&](TensorAttrs const &a) { - return cg_builder.create_weight(a); - }), - get_vector_piece_attrs(outputs)); - - std::optional model_training_instance = std::nullopt; - std::optional optimizer_attrs = std::nullopt; + LayerAddedResult layer_added_result = + cg_builder.add_layer_and_get_layer_added_result( + layer_attrs, + input_tensor_ids, + transform(get_vector_piece_attrs(weights), + 
+                    [&](TensorAttrs const &a) {
+                      return cg_builder.create_weight(a);
+                    }),
+          get_vector_piece_attrs(outputs));
+
   LocalTrainingBacking local_backing(allocator,
                                      cg_builder.computation_graph,
                                      tensor_backing_map,
-                                     this->runtime_arg_config,
-                                     model_training_instance,
-                                     optimizer_attrs);
-
-  local_backing.execute_init();
-  PerLayerElapsedTime fwd = local_backing.execute_forward();
-  PerLayerElapsedTime bwd = local_backing.execute_backward();
-
-  return CostDetails{get_total_elapsed_time(fwd, bwd),
+                                     this->runtime_arg_config);
+  local_backing.register_and_allocate_layer(layer_added_result.layer);
+  local_backing.execute_init(layer_added_result.layer);
+  float fwd = local_backing.execute_forward(layer_added_result.layer).value();
+  float bwd = local_backing.execute_backward(layer_added_result.layer).value();
+  float total_execution_time = fwd + bwd;
+
+  return CostDetails{total_execution_time,
                      tracked_allocator_ptr->get_current_mem_usage()};
 }
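Note that the rewritten cost path calls `.value()` on the per-layer results, which throws `std::bad_optional_access` if the added layer has no registered forward or backward task. A defensive variant (hypothetical, not in this patch) would fall back to zero cost:

    float fwd = local_backing.execute_forward(layer_added_result.layer).value_or(0.0f);
    float bwd = local_backing.execute_backward(layer_added_result.layer).value_or(0.0f);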
diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc
index f10b7c0126..25abc72567 100644
--- a/lib/local-execution/src/local_slots_backing.cc
+++ b/lib/local-execution/src/local_slots_backing.cc
@@ -18,39 +18,65 @@ void LocalSlotsBacking::add_per_device_op_state(
   this->per_device_op_states.insert({op_guid, device_state});
 }
 
-void LocalSlotsBacking::allocate_outgoing_tensors(
+void LocalSlotsBacking::insert_into_tensor_mapping(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  if (!contains_key(this->tensor_mapping, tensor)) {
+    this->tensor_mapping.insert({tensor, tensor_backing});
+  }
+}
+
+void LocalSlotsBacking::allocate_layer_tensors(
     layer_guid_t const &layer_guid,
     ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  std::vector<tensor_guid_t> incoming_input_tensors =
-      get_incoming_inputs(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> incoming_weight_tensors =
-      get_incoming_weights(computation_graph, layer_guid);
-  std::vector<tensor_guid_t> outgoing_tensors =
-      get_outgoing_tensors(computation_graph, layer_guid);
-  for (tensor_guid_t const &output_tensor : outgoing_tensors) {
-    TensorAttrs tensor_attrs =
-        get_tensor_attrs(computation_graph, output_tensor);
+  this->allocate_tensors_by_role(
+      TensorRole::INPUT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::WEIGHT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::OUTPUT, layer_guid, computation_graph, allocator);
+}
+
+void LocalSlotsBacking::allocate_tensors_by_role(
+    TensorRole const &role,
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    Allocator &allocator) {
+  std::vector<tensor_guid_t> tensors;
+  switch (role) {
+    case TensorRole::INPUT:
+      tensors = get_incoming_inputs(computation_graph, layer_guid);
+      this->input_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::WEIGHT:
+      tensors = get_incoming_weights(computation_graph, layer_guid);
+      this->weight_tensor_slots.insert({layer_guid, tensors});
+      break;
+    case TensorRole::OUTPUT:
+      tensors = get_outgoing_tensors(computation_graph, layer_guid);
+      this->output_tensor_slots.insert({layer_guid, tensors});
+      break;
+    default:
+      throw mk_runtime_error("Invalid tensor role, got {}", role);
+  }
+
+  for (tensor_guid_t const &tensor : tensors) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor);
     // tensor allocation
-    if (!is_tensor_allocated(output_tensor)) {
+    if (!is_tensor_allocated(tensor)) {
       GenericTensorAccessorW tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->tensor_mapping.insert({output_tensor, tensor_backing});
+      this->tensor_mapping.insert({tensor, tensor_backing});
     }
     // gradient tensor allocation
     if (tensor_attrs.create_gradients == CreateGrad::YES &&
-        !is_gradient_tensor_allocated(output_tensor)) {
+        !is_gradient_tensor_allocated(tensor)) {
       GenericTensorAccessorW gradient_tensor_backing =
           allocator.allocate_tensor(tensor_attrs.shape);
-      this->gradient_tensor_mapping.insert(
-          {output_tensor, gradient_tensor_backing});
+      this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing});
     }
   }
-
-  this->input_tensor_slots.insert({layer_guid, incoming_input_tensors});
-  this->weight_tensor_slots.insert({layer_guid, incoming_weight_tensors});
-  this->output_tensor_slots.insert({layer_guid, outgoing_tensors});
 }
 
 void LocalSlotsBacking::allocate_optimizer_tensors(
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index b7631470b7..0cb8146467 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -8,7 +8,6 @@
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
-#include "utils/containers/reversed.h"
 #include "utils/exception.h"
 
 namespace FlexFlow {
@@ -17,42 +16,30 @@ LocalTrainingBacking::LocalTrainingBacking(
     Allocator const &allocator,
     ComputationGraph const &computation_graph,
     TensorBackingMap const &tensor_backing_mapping,
-    RuntimeArgConfig const &runtime_arg_config,
-    std::optional<ModelTrainingInstance> const &training_instance,
-    std::optional<OptimizerAttrs> const &optimizer_attrs)
+    RuntimeArgConfig const &runtime_arg_config)
    : allocator(allocator), computation_graph(computation_graph),
      local_slots_backing(tensor_backing_mapping, runtime_arg_config),
-     task_registry(empty_task_registry()),
-     training_instance(training_instance), optimizer_attrs(optimizer_attrs) {
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(this->computation_graph, node).attrs;
-
-    // allocate outgoing tensors
-    this->local_slots_backing.allocate_outgoing_tensors(
-        node, this->computation_graph, this->allocator);
-
-    // register tasks
-    register_tasks_for_layer(this->task_registry, node, attrs);
-
-    // allocate optimizer buffers
-    if (attrs.has<WeightAttrs>() && this->training_instance.has_value()) {
-      TaskSignature sig = get_update_signature(this->optimizer_attrs.value());
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      this->local_slots_backing.allocate_optimizer_tensors(
-          node, weight_tensor, this->computation_graph, this->allocator, sig);
-    }
-  }
+     task_registry(empty_task_registry()) {}
+
+void LocalTrainingBacking::register_and_allocate_layer(
+    layer_guid_t const &node) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  this->local_slots_backing.allocate_layer_tensors(
+      node, this->computation_graph, this->allocator);
+  register_tasks_for_layer(this->task_registry, node, attrs);
+}
 
-  if (this->training_instance.has_value()) {
-    // label and logit tensor should be allocated
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().label_tensor));
-    assert(this->local_slots_backing.is_tensor_allocated(
-        this->training_instance.value().logit_tensor));
+void LocalTrainingBacking::allocate_layer_optimizer_tensors(
+    layer_guid_t const &node,
+    OptimizerAttrs const &optimizer_attrs) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  if (attrs.has<WeightAttrs>()) {
+    TaskSignature sig = get_update_signature(optimizer_attrs);
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    this->local_slots_backing.allocate_optimizer_tensors(
+        node, weight_tensor, this->computation_graph, this->allocator, sig);
   }
 }
 
@@ -76,110 +63,88 @@ std::optional<float>
   return fn(acc);
 }
 
-void LocalTrainingBacking::execute_init() {
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.init_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = init(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      DeviceSpecificDeviceStates device_state =
-          this->call_init_task_impl(invocation.task_id, accessor);
-      this->local_slots_backing.add_per_device_op_state(operator_node,
-                                                        device_state);
-    }
+void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::INIT)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = init(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    DeviceSpecificDeviceStates device_state =
+        this->call_init_task_impl(invocation.task_id, accessor);
+    this->local_slots_backing.add_per_device_op_state(operator_node,
+                                                      device_state);
   }
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_forward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  for (layer_guid_t const &operator_node :
-       topological_ordering(this->computation_graph)) {
-    if (this->task_registry.forward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = forward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::FWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = forward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-
-  return per_op_elapsed_time;
 }
 
-PerLayerElapsedTime LocalTrainingBacking::execute_backward() {
-  PerLayerElapsedTime per_op_elapsed_time;
-
-  // compute loss
-  if (this->training_instance.has_value()) {
-    ModelTrainingInstance unwrapped_training_instance =
-        training_instance.value();
-    TaskInvocation loss_invocation =
-        backward(unwrapped_training_instance.loss_attrs,
-                 unwrapped_training_instance.logit_tensor,
-                 unwrapped_training_instance.label_tensor);
-    // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
-    TaskArgumentAccessor loss_accessor =
-        this->get_task_arg_accessor(loss_invocation);
-    TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
-    loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
-  }
+void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                        tensor_guid_t const &logit_tensor,
+                                        tensor_guid_t const &label_tensor) {
+  assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) &&
+         this->local_slots_backing.is_tensor_allocated(label_tensor));
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
 
-  // backward through computation graph
-  for (layer_guid_t const &operator_node :
-       reversed(topological_ordering(this->computation_graph))) {
-    if (this->task_registry.backward_task_ids.at(operator_node).has_value()) {
-      ComputationGraphOpAttrs attrs =
-          get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-      OpTaskInvocation invocation = backward(attrs);
-      TaskArgumentAccessor accessor =
-          this->get_op_task_arg_accessor(invocation, operator_node);
-      std::optional<float> elapsed_time =
-          this->call_task_impl(invocation.task_id, accessor);
-      per_op_elapsed_time.insert({operator_node, elapsed_time});
-    }
+std::optional<float>
+    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_op_task(
+          this->task_registry, operator_node, OpTaskType::BWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    OpTaskInvocation invocation = backward(attrs);
+    TaskArgumentAccessor accessor =
+        this->get_op_task_arg_accessor(invocation, operator_node);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
   }
-  return per_op_elapsed_time;
 }
 
-void LocalTrainingBacking::execute_update() {
-  assert(this->training_instance.has_value());
-  assert(this->optimizer_attrs.has_value());
-
-  for (layer_guid_t const &node :
-       topological_ordering(this->computation_graph)) {
-    LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
-    if (layer_attrs.attrs.has<WeightAttrs>()) {
-      // get tensors
-      tensor_guid_t weight_tensor =
-          get_only(get_outgoing_tensors(this->computation_graph, node));
-      std::vector<tensor_guid_t> grad_buffer_tensors =
-          this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
-
-      // get invocation
-      TaskInvocation invocation = get_update_invocation(
-          this->optimizer_attrs.value(), weight_tensor, grad_buffer_tensors);
-      // assert(is_invocation_valid(get_update_signature(attrs), invocation));
-
-      // execute update
-      TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
-      TaskImplFunction update_impl_fn =
-          get_update_task_impl(this->optimizer_attrs.value());
-      update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
-    }
+void LocalTrainingBacking::execute_update(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+  if (layer_attrs.attrs.has<WeightAttrs>()) {
+    // get tensors
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<tensor_guid_t> grad_buffer_tensors =
+        this->local_slots_backing.weight_optimizer_tensor_guids.at(node);
+
+    // get invocation
+    TaskInvocation invocation = get_update_invocation(
+        optimizer_attrs, weight_tensor, grad_buffer_tensors);
+    // assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+    // execute update
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
   }
-
-  this->optimizer_attrs =
-      get_next_iteration_optimizer_attrs(this->optimizer_attrs.value());
 }
 
 TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
@@ -206,4 +171,9 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor(
       this->allocator, tensor_slots_backing, arg_slots_backing);
 }
 
+void LocalTrainingBacking::insert_tensor(
+    tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) {
+  this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing);
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
new file mode 100644
index 0000000000..7256a82478
--- /dev/null
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -0,0 +1,64 @@
+#include "local-execution/model_training_instance.h"
+#include "pcg/computation_graph.h"
+#include "utils/containers/reversed.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator,
+                                             ComputationGraph const & computation_graph,
+                                             TensorBackingMap const & tensor_backing_map,
+                                             RuntimeArgConfig const & runtime_arg_config,
+                                             LossAttrs const & loss_attrs,
+                                             tensor_guid_t const &logit_tensor,
+                                             tensor_guid_t const &label_tensor,
+                                             OptimizerAttrs const & optimizer_attrs)
+    : computation_graph(computation_graph),
+      training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config),
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor),
+      label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {}
+
+void ModelTrainingInstance::register_and_allocate_layers() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.register_and_allocate_layer(node);
+  }
+}
+
+void ModelTrainingInstance::allocate_optimizer_tensors() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs);
+  }
+}
+
+void ModelTrainingInstance::execute_init() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_init(node);
+  }
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_forward() {
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    std::optional<float> elapsed_time = this->training_backing.execute_forward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_backward() {
+  this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor);
+
+  PerLayerElapsedTime per_layer_elapsed_time;
+  for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) {
+    std::optional<float> elapsed_time = this->training_backing.execute_backward(node);
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+void ModelTrainingInstance::execute_update() {
+  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_update(node, this->optimizer_attrs);
+  }
+  this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs);
+}
+
+}
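Taken together, ModelTrainingInstance rebuilds the old whole-graph training flow on top of the per-layer backing. A usage sketch (hypothetical; assumes backed `logit`/`label` tensors and constructed `loss_attrs`/`optimizer_attrs`):

    ModelTrainingInstance instance(allocator, cg, tensor_backing_map,
                                   runtime_arg_config, loss_attrs,
                                   logit, label, optimizer_attrs);
    instance.register_and_allocate_layers();
    instance.allocate_optimizer_tensors();
    instance.execute_init();
    for (int iter = 0; iter < num_iters; iter++) {
      PerLayerElapsedTime fwd = instance.execute_forward();
      PerLayerElapsedTime bwd = instance.execute_backward(); // runs compute_loss first
      instance.execute_update(); // also advances optimizer_attrs
    }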
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index dad5c1fc69..3cd2cccae8 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -35,10 +35,32 @@ void register_tasks_for_layer(TaskRegistry &task_registry,
         task_registry.backward_task_ids[op_id] = task_id;
         break;
       default:
-        throw mk_runtime_error("Invalid OpTaskType");
+        throw mk_runtime_error("Invalid OpTaskType, got {}",
+                               task_signature_impl.task_signature.type);
     }
     task_registry.task_mapping.insert({task_id, task_signature_impl});
   }
 }
 
+bool registry_contains_op_task(TaskRegistry const &task_registry,
+                               layer_guid_t const &op,
+                               OpTaskType const &op_task_type) {
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> task_ids;
+  switch (op_task_type) {
+    case OpTaskType::INIT:
+      task_ids = task_registry.init_task_ids;
+      break;
+    case OpTaskType::FWD:
+      task_ids = task_registry.forward_task_ids;
+      break;
+    case OpTaskType::BWD:
+      task_ids = task_registry.backward_task_ids;
+      break;
+    default:
+      throw mk_runtime_error("Invalid OpTaskType, got {}", op_task_type);
+  }
+
+  return task_ids.at(op).has_value();
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc
index 779ba43f26..5d58e7e757 100644
--- a/lib/local-execution/test/src/test_local_slots_backing.cc
+++ b/lib/local-execution/test/src/test_local_slots_backing.cc
@@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     LocalSlotsBacking local_slots_backing = {tensor_backing_map,
                                              runtime_arg_config};
 
-    SUBCASE("LocalSlotsBacking::allocate_outgoing_tensors") {
+    SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") {
       auto get_result_shape_and_dtype_for_tensor_guid_and_map =
           [&](tensor_guid_t t,
               TensorBackingMap m) -> std::pair<ArrayShape, DataType> {
@@ -92,14 +92,11 @@ TEST_SUITE(FF_TEST_SUITE) {
 
       SUBCASE("Input (QKV) and gradient tensors allocation") {
         // allocate all tensors from input nodes
-        for (layer_guid_t const &node :
-             topological_ordering(cg_builder.computation_graph)) {
-          if (node == layer_guid) {
-            break;
-          }
-          local_slots_backing.allocate_outgoing_tensors(
-              node, cg_builder.computation_graph, allocator);
-        }
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::INPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
 
         SUBCASE("Query grad") {
           std::pair<ArrayShape, DataType> result =
@@ -127,8 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         }
       }
       SUBCASE("Output and gradient tensors allocation") {
-        local_slots_backing.allocate_outgoing_tensors(
-            layer_guid, cg_builder.computation_graph, allocator);
+        local_slots_backing.allocate_tensors_by_role(
+            TensorRole::OUTPUT,
+            layer_guid,
+            cg_builder.computation_graph,
+            allocator);
         SUBCASE("Output") {
           std::pair<ArrayShape, DataType> result =
               get_result_shape_and_dtype_for_tensor_guid_and_map(
@@ -154,7 +154,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
 
     SUBCASE("Tensor slots") {
-      local_slots_backing.allocate_outgoing_tensors(
+      local_slots_backing.allocate_layer_tensors(
           layer_guid, cg_builder.computation_graph, allocator);
       SUBCASE("Input tensor slots") {
         std::vector<tensor_guid_t> correct_incoming_input_tensors =
@@ -211,12 +211,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       return b;
     }();
 
-    // allocate all incoming and outgoing tensors for graph
-    for (layer_guid_t const &node :
-         topological_ordering(cg_builder.computation_graph)) {
-      local_slots_backing.allocate_outgoing_tensors(
-          node, cg_builder.computation_graph, allocator);
-    }
+    local_slots_backing.allocate_layer_tensors(
+        layer_guid, cg_builder.computation_graph, allocator);
 
     SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") {
       TensorSlotsBackingWithoutAddresses result =
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 37024adc26..c4662d624c 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -3,6 +3,8 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -19,12 +21,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         EnableProfiling::YES,
         ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
 
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.0,
-                                         /*momentum=*/0.0,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.0}};
-
     // construct graph
     ComputationGraphBuilder cg_builder;
 
@@ -36,8 +32,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar multiply";
    tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -46,6 +43,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    // topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    local_backing.register_and_allocate_layer(
+        get_layer_by_name(cg_builder.computation_graph, layer_name));
+
     SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
       TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered<size_t>{batch_size, 1}}, DataType::FLOAT};
@@ -53,22 +61,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           cg_builder.create_input(label_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
-      std::optional<ModelTrainingInstance> model_training_instance =
-          ModelTrainingInstance{
-              LossAttrs{SparseCategoricalCrossEntropyLossAttrs{
-                  /*replace_labels=*/false}},
-              label_tensor,
-              logit_tensor};
-      LocalTrainingBacking local_backing(allocator,
-                                         cg_builder.computation_graph,
-                                         tensor_backing_map,
-                                         runtime_arg_config,
-                                         model_training_instance,
-                                         optimizer_attrs);
-      local_backing.execute_init();
-      local_backing.execute_forward();
-      local_backing.execute_backward();
+      local_backing.insert_tensor(label_tensor, label_backing);
+      LossAttrs loss_attrs = LossAttrs{
+          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
+      local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
     }
 
     SUBCASE("NonconfigurableLossAttrs") {
@@ -76,58 +72,24 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
           cg_builder.create_input(input_shape, CreateGrad::NO);
       GenericTensorAccessorW label_backing =
           allocator.allocate_tensor(input_shape);
-      tensor_backing_map.insert({label_tensor, label_backing});
+      local_backing.insert_tensor(label_tensor, label_backing);
 
       SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{LossAttrs{NonconfigurableLossAttrs{
-                                      LossFunction::CATEGORICAL_CROSSENTROPY}},
-                                  label_tensor,
-                                  logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{
+            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
 
       SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
+            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
 
       SUBCASE("LossFunction::IDENTITY") {
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
+        LossAttrs loss_attrs =
+            LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}};
+        local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor);
       }
     }
   }
 }
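Both loss tests register only the single named layer; the commented-out loop is the whole-graph alternative. If that pattern recurs across tests, a helper along these lines could factor it out (hypothetical, not part of this patch):

    void register_and_allocate_all_layers(LocalTrainingBacking &backing,
                                          ComputationGraph const &cg) {
      for (layer_guid_t const &node : topological_ordering(cg)) {
        backing.register_and_allocate_layer(node);
      }
    }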
diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc
index 96b748806f..b48214d89d 100644
--- a/lib/local-execution/test/src/test_update_e2e.cc
+++ b/lib/local-execution/test/src/test_update_e2e.cc
@@ -3,6 +3,7 @@
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
 #include "test_utils.h"
@@ -30,8 +31,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         cg_builder.create_input(input_shape, CreateGrad::YES);
 
     float scalar = 4.0;
+    std::string layer_name = "scalar_multiply";
     tensor_guid_t logit_tensor =
-        cg_builder.scalar_multiply(input_tensor, scalar);
+        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
 
     // allocate memory
     Allocator allocator = create_local_cuda_memory_allocator();
@@ -40,11 +42,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
         allocator.allocate_tensor(input_shape);
     tensor_backing_map.insert({input_tensor, input_backing});
 
-    tensor_guid_t label_tensor =
-        cg_builder.create_input(input_shape, CreateGrad::NO);
-    GenericTensorAccessorW label_backing =
-        allocator.allocate_tensor(input_shape);
-    tensor_backing_map.insert({label_tensor, label_backing});
+    LocalTrainingBacking local_backing(allocator,
+                                       cg_builder.computation_graph,
+                                       tensor_backing_map,
+                                       runtime_arg_config);
+    // for (layer_guid_t const & node:
+    // topological_ordering(cg_builder.computation_graph)) {
+    //   local_backing.register_and_allocate_layer(node);
+    // }
+    layer_guid_t layer_guid =
+        get_layer_by_name(cg_builder.computation_graph, layer_name);
+    local_backing.register_and_allocate_layer(layer_guid);
 
     SUBCASE("SGDOptimizerAttrs") {
       SUBCASE("momentum=0") {
@@ -53,22 +61,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                          /*momentum=*/0.0f,
                                          /*nesterov=*/false,
                                          /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
       SUBCASE("momentum=0.9") {
         OptimizerAttrs optimizer_attrs =
@@ -76,22 +71,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                              /*momentum=*/0.9,
                                              /*nesterov=*/false,
                                              /*weight_decay=*/0.001}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
     }
     SUBCASE("AdamOptimizerAttrs") {
@@ -104,22 +86,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
                                           /*beta_t=*/0.9,
                                           /*beta2_t=*/0.999,
                                           /*epsilon=*/1e-8}};
-        std::optional<ModelTrainingInstance> model_training_instance =
-            ModelTrainingInstance{
-                LossAttrs{NonconfigurableLossAttrs{
-                    LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}},
-                label_tensor,
-                logit_tensor};
-        LocalTrainingBacking local_backing(allocator,
-                                           cg_builder.computation_graph,
-                                           tensor_backing_map,
-                                           runtime_arg_config,
-                                           model_training_instance,
-                                           optimizer_attrs);
-        local_backing.execute_init();
-        local_backing.execute_forward();
-        local_backing.execute_backward();
-        local_backing.execute_update();
+        local_backing.allocate_layer_optimizer_tensors(layer_guid,
+                                                       optimizer_attrs);
+        local_backing.execute_update(layer_guid, optimizer_attrs);
       }
     }
   }
 }
diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h
index 45cde0de57..585399ea1d 100644
--- a/lib/pcg/include/pcg/computation_graph_builder.h
+++ b/lib/pcg/include/pcg/computation_graph_builder.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_PCG_INCLUDE_PCG_COMPUTATION_GRAPH_BUILDER_H
 
 #include "pcg/computation_graph.dtg.h"
+#include "pcg/computation_graph/layer_added_result.dtg.h"
 #include "pcg/initializer_attrs.dtg.h"
 #include "pcg/tensor_guid_t.dtg.h"
 
@@ -256,6 +257,12 @@ struct ComputationGraphBuilder {
   std::vector<tensor_guid_t> get_outputs(LayerAttrs const &) const;
   tensor_guid_t get_output(LayerAttrs const &, int idx) const;
 
+  LayerAddedResult add_layer_and_get_layer_added_result(
+      LayerAttrs const &layer,
+      std::vector<tensor_guid_t> const &inputs,
+      std::vector<tensor_guid_t> const &weights,
+      std::vector<TensorAttrs> const &outputs);
+
   std::vector<tensor_guid_t> add_layer(LayerAttrs const &layer,
                                        std::vector<tensor_guid_t> const &inputs,
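`add_layer_and_get_layer_added_result` exists for callers that need the new layer's guid as well as its outputs, e.g. the cost estimator's per-layer calls above. A usage sketch (hypothetical):

    LayerAddedResult added = cg_builder.add_layer_and_get_layer_added_result(
        layer_attrs, inputs, weights, outputs);
    layer_guid_t layer = added.layer;    // usable with the per-layer backing API
    std::vector<tensor_guid_t> outs = added.outputs; // what add_layer returns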
diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc
index 4a565476bd..4c619288cb 100644
--- a/lib/pcg/src/pcg/computation_graph_builder.cc
+++ b/lib/pcg/src/pcg/computation_graph_builder.cc
@@ -106,7 +106,7 @@ static void check_incoming_tensor_roles(LayerAttrs const &layer,
   }
 }
 
-std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+LayerAddedResult ComputationGraphBuilder::add_layer_and_get_layer_added_result(
     LayerAttrs const &layer,
     std::vector<tensor_guid_t> const &inputs,
     std::vector<tensor_guid_t> const &weights,
     std::vector<TensorAttrs> const &outputs) {
@@ -115,7 +115,17 @@
   LayerAddedResult added = ::FlexFlow::add_layer(
       this->computation_graph, layer, concat_vectors(inputs, weights), outputs);
 
-  return added.outputs;
+  return added;
+}
+
+std::vector<tensor_guid_t> ComputationGraphBuilder::add_layer(
+    LayerAttrs const &layer,
+    std::vector<tensor_guid_t> const &inputs,
+    std::vector<tensor_guid_t> const &weights,
+    std::vector<TensorAttrs> const &outputs) {
+  return this
+      ->add_layer_and_get_layer_added_result(layer, inputs, weights, outputs)
+      .outputs;
 }
 
 tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x,

From a73b1c325f819f1ffdcdc0ce38fda1e25fd2eb28 Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 13 Nov 2024 09:22:43 -0800
Subject: [PATCH 21/91] Expose op folders publicly

---
 .../local-execution/model_training_instance.h |  2 -
 .../local-execution}/ops/attention.h          |  0
 .../local-execution}/ops/batch_matmul.h       |  0
 .../local-execution}/ops/batch_norm.h         |  0
 .../local-execution}/ops/cast.h               |  0
 .../local-execution}/ops/combine.h            |  0
 .../local-execution}/ops/concat.h             |  0
 .../local-execution}/ops/conv_2d.h            |  0
 .../local-execution}/ops/dropout.h            |  0
 .../local-execution}/ops/element_binary.h     |  0
 .../local-execution}/ops/element_unary.h      |  0
 .../local-execution}/ops/embedding.h          |  0
 .../local-execution}/ops/flat.h               |  0
 .../local-execution}/ops/gather.h             |  0
 .../local-execution}/ops/input.h              |  0
 .../local-execution}/ops/layer_norm.h         |  0
 .../local-execution}/ops/linear.h             |  0
 .../local-execution}/ops/noop.h               |  0
 .../local-execution}/ops/parallel_op.h        |  0
 .../local-execution}/ops/pool_2d.h            |  0
 .../local-execution}/ops/reduce.h             |  0
 .../local-execution}/ops/reduction.h          |  0
 .../local-execution}/ops/repartition.h        |  0
 .../local-execution}/ops/replicate.h          |  0
 .../local-execution}/ops/reshape.h            |  0
 .../local-execution}/ops/reverse.h            |  0
 .../local-execution}/ops/softmax.h            |  0
 .../local-execution}/ops/split.h              |  0
 .../local-execution}/ops/topk.h               |  0
 .../local-execution}/ops/transpose.h          |  0
 .../local-execution}/ops/weight.h             |  0
 .../src/model_training_instance.cc            | 11 +---
 lib/local-execution/src/ops/attention.cc      |  2 +-
 lib/local-execution/src/ops/batch_matmul.cc   |  2 +-
 lib/local-execution/src/ops/batch_norm.cc     |  2 +-
 lib/local-execution/src/ops/cast.cc           |  2 +-
 lib/local-execution/src/ops/combine.cc        |  2 +-
 lib/local-execution/src/ops/concat.cc         |  2 +-
 lib/local-execution/src/ops/conv_2d.cc        |  2 +-
 lib/local-execution/src/ops/dropout.cc        |  2 +-
 lib/local-execution/src/ops/element_binary.cc |  2 +-
 lib/local-execution/src/ops/element_unary.cc  |  2 +-
 lib/local-execution/src/ops/flat.cc           |  2 +-
 lib/local-execution/src/ops/gather.cc         |  2 +-
 lib/local-execution/src/ops/input.cc          |  2 +-
 lib/local-execution/src/ops/layer_norm.cc     |  2 +-
 lib/local-execution/src/ops/linear.cc         |  2 +-
 lib/local-execution/src/ops/noop.cc           |  2 +-
 lib/local-execution/src/ops/pool_2d.cc        |  2 +-
 lib/local-execution/src/ops/reduce.cc         |  2 +-
 lib/local-execution/src/ops/reduction.cc      |  2 +-
 lib/local-execution/src/ops/repartition.cc    |  2 +-
 lib/local-execution/src/ops/replicate.cc      |  2 +-
 lib/local-execution/src/ops/reshape.cc        |  2 +-
 lib/local-execution/src/ops/reverse.cc        |  2 +-
 lib/local-execution/src/ops/softmax.cc        |  2 +-
 lib/local-execution/src/ops/split.cc          |  2 +-
 lib/local-execution/src/ops/topk.cc           |  2 +-
 lib/local-execution/src/ops/transpose.cc      |  2 +-
 lib/local-execution/src/ops/weight.cc         |  2 +-
 .../src/task_signature_impl.cc                | 58 +++++++++----------
 .../include/op-attrs/operator_attrs.h         | 58 +++++++++----------
 62 files changed, 89 insertions(+), 96 deletions(-)
 rename lib/local-execution/{src => include/local-execution}/ops/attention.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/batch_matmul.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/batch_norm.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/cast.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/combine.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/concat.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/conv_2d.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/dropout.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/element_binary.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/element_unary.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/embedding.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/flat.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/gather.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/input.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/layer_norm.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/linear.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/noop.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/parallel_op.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/pool_2d.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reduce.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reduction.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/repartition.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/replicate.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reshape.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/reverse.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/softmax.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/split.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/topk.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/transpose.h (100%)
 rename lib/local-execution/{src => include/local-execution}/ops/weight.h (100%)

diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h
index 08f373a16f..14473ff26e 100644
--- a/lib/local-execution/include/local-execution/model_training_instance.h
+++ b/lib/local-execution/include/local-execution/model_training_instance.h
@@ -19,8 +19,6 @@ struct ModelTrainingInstance {
                         tensor_guid_t const & label_tensor,
                         OptimizerAttrs const &);
 
-  void register_and_allocate_layers();
-  void allocate_optimizer_tensors();
   void execute_init();
   PerLayerElapsedTime execute_forward();
   PerLayerElapsedTime execute_backward();
diff --git a/lib/local-execution/src/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h
similarity index 100%
rename from lib/local-execution/src/ops/attention.h
rename to lib/local-execution/include/local-execution/ops/attention.h
diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h
similarity index 100%
rename from lib/local-execution/src/ops/batch_matmul.h
rename to lib/local-execution/include/local-execution/ops/batch_matmul.h
diff --git a/lib/local-execution/src/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h
similarity index 100%
rename from lib/local-execution/src/ops/batch_norm.h
rename to lib/local-execution/include/local-execution/ops/batch_norm.h
diff --git a/lib/local-execution/src/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h
similarity index 100%
rename from lib/local-execution/src/ops/cast.h
rename to lib/local-execution/include/local-execution/ops/cast.h
diff --git a/lib/local-execution/src/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h
similarity index 100%
rename from lib/local-execution/src/ops/combine.h
rename to lib/local-execution/include/local-execution/ops/combine.h
diff --git a/lib/local-execution/src/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h
similarity index 100%
rename from lib/local-execution/src/ops/concat.h
rename to lib/local-execution/include/local-execution/ops/concat.h
diff --git a/lib/local-execution/src/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h
similarity index 100%
rename from lib/local-execution/src/ops/conv_2d.h
rename to lib/local-execution/include/local-execution/ops/conv_2d.h
diff --git a/lib/local-execution/src/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h
similarity index 100%
rename from lib/local-execution/src/ops/dropout.h
rename to lib/local-execution/include/local-execution/ops/dropout.h
diff --git a/lib/local-execution/src/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h
similarity index 100%
rename from lib/local-execution/src/ops/element_binary.h
rename to lib/local-execution/include/local-execution/ops/element_binary.h
diff --git a/lib/local-execution/src/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h
similarity index 100%
rename from lib/local-execution/src/ops/element_unary.h
rename to lib/local-execution/include/local-execution/ops/element_unary.h
diff --git a/lib/local-execution/src/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h
similarity index 100%
rename from lib/local-execution/src/ops/embedding.h
rename to lib/local-execution/include/local-execution/ops/embedding.h
diff --git a/lib/local-execution/src/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h
similarity index 100%
rename from lib/local-execution/src/ops/flat.h
rename to lib/local-execution/include/local-execution/ops/flat.h
diff --git a/lib/local-execution/src/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h
similarity index 100%
rename from lib/local-execution/src/ops/gather.h
rename to lib/local-execution/include/local-execution/ops/gather.h
diff --git a/lib/local-execution/src/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h
similarity index 100%
rename from lib/local-execution/src/ops/input.h
rename to lib/local-execution/include/local-execution/ops/input.h
diff --git a/lib/local-execution/src/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h
similarity index 100%
rename from lib/local-execution/src/ops/layer_norm.h
rename to lib/local-execution/include/local-execution/ops/layer_norm.h
diff --git a/lib/local-execution/src/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h
similarity index 100%
rename from lib/local-execution/src/ops/linear.h
rename to lib/local-execution/include/local-execution/ops/linear.h
diff --git a/lib/local-execution/src/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h
similarity index 100%
rename from lib/local-execution/src/ops/noop.h
rename to lib/local-execution/include/local-execution/ops/noop.h
diff --git a/lib/local-execution/src/ops/parallel_op.h b/lib/local-execution/include/local-execution/ops/parallel_op.h
similarity index 100%
rename from lib/local-execution/src/ops/parallel_op.h
rename to lib/local-execution/include/local-execution/ops/parallel_op.h
diff --git a/lib/local-execution/src/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h
similarity index 100%
rename from lib/local-execution/src/ops/pool_2d.h
rename to lib/local-execution/include/local-execution/ops/pool_2d.h
diff --git a/lib/local-execution/src/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h
similarity index 100%
rename from lib/local-execution/src/ops/reduce.h
rename to lib/local-execution/include/local-execution/ops/reduce.h
diff --git a/lib/local-execution/src/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h
similarity index 100%
rename from lib/local-execution/src/ops/reduction.h
rename to lib/local-execution/include/local-execution/ops/reduction.h
diff --git a/lib/local-execution/src/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h
similarity index 100%
rename from lib/local-execution/src/ops/repartition.h
rename to lib/local-execution/include/local-execution/ops/repartition.h
diff --git a/lib/local-execution/src/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h
similarity index 100%
rename from lib/local-execution/src/ops/replicate.h
rename to lib/local-execution/include/local-execution/ops/replicate.h
diff --git a/lib/local-execution/src/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h
similarity index 100%
rename from lib/local-execution/src/ops/reshape.h
rename to lib/local-execution/include/local-execution/ops/reshape.h
diff --git a/lib/local-execution/src/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h
similarity index 100%
rename from lib/local-execution/src/ops/reverse.h
rename to lib/local-execution/include/local-execution/ops/reverse.h
diff --git a/lib/local-execution/src/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h
similarity index 100%
rename from lib/local-execution/src/ops/softmax.h
rename to lib/local-execution/include/local-execution/ops/softmax.h
diff --git a/lib/local-execution/src/ops/split.h b/lib/local-execution/include/local-execution/ops/split.h
similarity index 100%
rename from lib/local-execution/src/ops/split.h
rename to lib/local-execution/include/local-execution/ops/split.h
diff --git a/lib/local-execution/src/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h
similarity index 100%
rename from lib/local-execution/src/ops/topk.h
rename to lib/local-execution/include/local-execution/ops/topk.h
diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h
similarity index 100%
rename from lib/local-execution/src/ops/transpose.h
rename to lib/local-execution/include/local-execution/ops/transpose.h
diff --git a/lib/local-execution/src/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h
similarity index 100%
rename from lib/local-execution/src/ops/weight.h
rename to lib/local-execution/include/local-execution/ops/weight.h
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index 7256a82478..abdced1bb5 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -14,16 +14,11 @@ ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator,
                                              tensor_guid_t const &label_tensor,
                                              OptimizerAttrs const & optimizer_attrs)
     : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config),
-      loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {}
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {
 
-void ModelTrainingInstance::register_and_allocate_layers() {
+  // allocate each layer's tensors
   for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
     this->training_backing.register_and_allocate_layer(node);
-  }
-}
-
-void ModelTrainingInstance::allocate_optimizer_tensors() {
-  for (layer_guid_t const & node: topological_ordering(this->computation_graph)) {
     this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs);
   }
 }
@@ -61,4 +56,4 @@ void ModelTrainingInstance::execute_update() {
   this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs);
 }
 
-}
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc
index 5e693d43db..b4c5d1ff8a 100644
--- a/lib/local-execution/src/ops/attention.cc
+++ b/lib/local-execution/src/ops/attention.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "attention.h"
+#include "local-execution/ops/attention.h"
 #include "kernels/attention_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/ops/attention.h"
diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc
index d60a003061..e358e0a645 100644
--- a/lib/local-execution/src/ops/batch_matmul.cc
+++ b/lib/local-execution/src/ops/batch_matmul.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "batch_matmul.h"
+#include "local-execution/ops/batch_matmul.h"
 #include "kernels/batch_matmul_kernels.h"
 #include "local-execution/op_task_signature.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc
index 254d7ef39e..62155aa161 100644
--- a/lib/local-execution/src/ops/batch_norm.cc
+++ b/lib/local-execution/src/ops/batch_norm.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "batch_norm.h"
+#include "local-execution/ops/batch_norm.h"
 #include "kernels/batch_norm_kernels.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/ops/cast.cc
index d3e43a46a0..846faa9262 100644
--- a/lib/local-execution/src/ops/cast.cc
+++ b/lib/local-execution/src/ops/cast.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "cast.h"
+#include "local-execution/ops/cast.h"
 #include "kernels/cast_kernels.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/ops/combine.cc
index 92f2931344..b7e84878f4 100644
--- a/lib/local-execution/src/ops/combine.cc
+++ b/lib/local-execution/src/ops/combine.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "combine.h"
+#include "local-execution/ops/combine.h"
 #include "kernels/combine_kernels.h"
 #include "local-execution/op_task_invocation.h"
 #include "utils/hash-utils.h"
diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/ops/concat.cc
index 42d98c336a..dee1dd08e5 100644
--- a/lib/local-execution/src/ops/concat.cc
+++ b/lib/local-execution/src/ops/concat.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "concat.h"
+#include "local-execution/ops/concat.h"
 #include "kernels/concat_kernels.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc
index 7694a03947..7ae92d70c7 100644
--- a/lib/local-execution/src/ops/conv_2d.cc
+++ b/lib/local-execution/src/ops/conv_2d.cc
@@ -1,4 +1,4 @@
-#include "conv_2d.h"
+#include "local-execution/ops/conv_2d.h"
 #include "kernels/conv_2d_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/ops/dropout.cc
index 77a2963313..017d023ec4 100644
--- a/lib/local-execution/src/ops/dropout.cc
+++ b/lib/local-execution/src/ops/dropout.cc
@@ -1,4 +1,4 @@
-#include "dropout.h"
+#include "local-execution/ops/dropout.h"
 #include "kernels/dropout_kernels.h"
 #include "local-execution/op_task_invocation.h"
 #include "local-execution/op_task_signature.h"
diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/ops/element_binary.cc
index 2152b1beea..d4c12c7285 100644
--- a/lib/local-execution/src/ops/element_binary.cc
+++ b/lib/local-execution/src/ops/element_binary.cc
@@ -1,4 +1,4 @@
-#include "element_binary.h"
+#include "local-execution/ops/element_binary.h"
 #include "kernels/element_binary_kernels.h"
 #include "local-execution/task_signature_impl.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/ops/element_unary.cc
index ccb41d7461..85ecf3db23 100644
--- a/lib/local-execution/src/ops/element_unary.cc
+++ b/lib/local-execution/src/ops/element_unary.cc
@@ -1,4 +1,4 @@
-#include "element_unary.h"
+#include "local-execution/ops/element_unary.h"
 #include "kernels/element_unary_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/ops/flat.cc
index 8df5703f60..ef4dc7ab68 100644
--- a/lib/local-execution/src/ops/flat.cc
+++ b/lib/local-execution/src/ops/flat.cc
@@ -1,4 +1,4 @@
-#include "flat.h"
+#include "local-execution/ops/flat.h"
 #include "kernels/flat_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc
index 558988f9a4..180026e9ba 100644
--- a/lib/local-execution/src/ops/gather.cc
+++ b/lib/local-execution/src/ops/gather.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "gather.h"
+#include "local-execution/ops/gather.h"
 #include "kernels/gather_kernels.h"
 #include "local-execution/legion_tensor_shape.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/ops/input.cc
index 56d19fa1ba..d7a3888220 100644
--- a/lib/local-execution/src/ops/input.cc
+++ b/lib/local-execution/src/ops/input.cc
@@ -1,4 +1,4 @@
-#include "input.h"
+#include "local-execution/ops/input.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc
index b1f44d69ae..c9e2a8d55e 100644
--- a/lib/local-execution/src/ops/layer_norm.cc
+++ b/lib/local-execution/src/ops/layer_norm.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "layer_norm.h"
+#include "local-execution/ops/layer_norm.h"
 #include "kernels/layer_norm_kernels.h"
 #include "local-execution/legion_tensor_shape.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc
index 9e29a0cce0..075aa1d9e4 100644
--- a/lib/local-execution/src/ops/linear.cc
+++ b/lib/local-execution/src/ops/linear.cc
@@ -1,4 +1,4 @@
-#include "linear.h"
+#include "local-execution/ops/linear.h"
 #include "kernels/linear_kernels.h"
 #include "local-execution/task_argument_accessor.h"
 #include "op-attrs/ff_dim.h"
diff --git a/lib/local-execution/src/ops/noop.cc b/lib/local-execution/src/ops/noop.cc
index e35fdec275..7357806880 100644
--- a/lib/local-execution/src/ops/noop.cc
+++ b/lib/local-execution/src/ops/noop.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "noop.h"
+#include "local-execution/ops/noop.h"
 
 namespace FlexFlow {
diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc
index 126f57be0d..66f27fa69f 100644
--- a/lib/local-execution/src/ops/pool_2d.cc
+++ b/lib/local-execution/src/ops/pool_2d.cc
@@ -1,4 +1,4 @@
-#include "pool_2d.h"
+#include "local-execution/ops/pool_2d.h"
 #include "kernels/pool_2d_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc
index 01d2f0e86f..c157a98b36 100644
--- a/lib/local-execution/src/ops/reduce.cc
+++ b/lib/local-execution/src/ops/reduce.cc
@@ -1,4 +1,4 @@
-#include "reduce.h"
+#include "local-execution/ops/reduce.h"
 #include "kernels/reduce_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc
index f946b7d146..95962661e2 100644
--- a/lib/local-execution/src/ops/reduction.cc
+++ b/lib/local-execution/src/ops/reduction.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reduction.h"
+#include "local-execution/ops/reduction.h"
 #include "kernels/reduction_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/ops/repartition.cc
index e260fd77f5..9bba8109f3 100644
--- a/lib/local-execution/src/ops/repartition.cc
+++ b/lib/local-execution/src/ops/repartition.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "repartition.h"
+#include "local-execution/ops/repartition.h"
 #include "kernels/partition_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc
index 10cd80a6d9..5ae93c4439 100644
--- a/lib/local-execution/src/ops/replicate.cc
+++ b/lib/local-execution/src/ops/replicate.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "replicate.h"
+#include "local-execution/ops/replicate.h"
 #include "kernels/replicate_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/ops/reshape.cc
index 433e961a8a..838542a8eb 100644
--- a/lib/local-execution/src/ops/reshape.cc
+++ b/lib/local-execution/src/ops/reshape.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reshape.h"
+#include "local-execution/ops/reshape.h"
 #include "kernels/reshape_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc
index b767b61b20..63032585b8 100644
--- a/lib/local-execution/src/ops/reverse.cc
+++ b/lib/local-execution/src/ops/reverse.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "reverse.h"
+#include "local-execution/ops/reverse.h"
 #include "kernels/accessor.h"
 #include "kernels/reverse_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc
index 36c4afcaf3..5e78781ddc 100644
--- a/lib/local-execution/src/ops/softmax.cc
+++ b/lib/local-execution/src/ops/softmax.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "softmax.h"
+#include "local-execution/ops/softmax.h"
 #include "kernels/softmax_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "op-attrs/parallel_tensor_shape.h"
diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc
index dc627aae96..556d30109b 100644
--- a/lib/local-execution/src/ops/split.cc
+++ b/lib/local-execution/src/ops/split.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "split.h"
+#include "local-execution/ops/split.h"
 #include "kernels/array_shape.h"
 #include "kernels/split_kernels.h"
 #include "op-attrs/get_output_shapes.h"
diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc
index ea4fc09e19..41a28340db 100644
--- a/lib/local-execution/src/ops/topk.cc
+++ b/lib/local-execution/src/ops/topk.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
  */
 
-#include "topk.h"
+#include "local-execution/ops/topk.h"
 #include "kernels/topk_kernels.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/exception.h"
diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
index 435df464c0..78e9fbde6f 100644
--- a/lib/local-execution/src/ops/transpose.cc
+++ b/lib/local-execution/src/ops/transpose.cc
@@ -13,7 +13,7 @@
  * limitations under the License.
*/ -#include "transpose.h" +#include "local-execution/ops/transpose.h" #include "kernels/transpose_kernels.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/transpose.h" diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/ops/weight.cc index 5537163e85..f96c104f33 100644 --- a/lib/local-execution/src/ops/weight.cc +++ b/lib/local-execution/src/ops/weight.cc @@ -1,4 +1,4 @@ -#include "weight.h" +#include "local-execution/ops/weight.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index 3072b9a8bd..199e232a6b 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -1,33 +1,33 @@ #include "local-execution/task_signature_impl.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" -#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" -#include "ops/weight.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" +#include "local-execution/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 268554b5be..11afc5b209 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -3,35 +3,35 @@ #include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" -#include "ops/attention.h" -#include "ops/batch_matmul.h" -#include "ops/batch_norm.h" -#include "ops/broadcast.h" -#include "ops/cast.h" -#include "ops/combine.h" -#include "ops/concat.h" -#include "ops/conv_2d.h" -#include "ops/dropout.h" -#include "ops/element_binary.h" -#include "ops/element_unary.h" -#include "ops/embedding.h" 
-#include "ops/flat.h" -#include "ops/gather.h" -#include "ops/input.h" -#include "ops/layer_norm.h" -#include "ops/linear.h" -#include "ops/noop.h" -#include "ops/pool_2d.h" -#include "ops/reduce.h" -#include "ops/reduction.h" -#include "ops/repartition.h" -#include "ops/replicate.h" -#include "ops/reshape.h" -#include "ops/reverse.h" -#include "ops/softmax.h" -#include "ops/split.h" -#include "ops/topk.h" -#include "ops/transpose.h" +#include "local-execution/ops/attention.h" +#include "local-execution/ops/batch_matmul.h" +#include "local-execution/ops/batch_norm.h" +#include "local-execution/ops/broadcast.h" +#include "local-execution/ops/cast.h" +#include "local-execution/ops/combine.h" +#include "local-execution/ops/concat.h" +#include "local-execution/ops/conv_2d.h" +#include "local-execution/ops/dropout.h" +#include "local-execution/ops/element_binary.h" +#include "local-execution/ops/element_unary.h" +#include "local-execution/ops/embedding.h" +#include "local-execution/ops/flat.h" +#include "local-execution/ops/gather.h" +#include "local-execution/ops/input.h" +#include "local-execution/ops/layer_norm.h" +#include "local-execution/ops/linear.h" +#include "local-execution/ops/noop.h" +#include "local-execution/ops/pool_2d.h" +#include "local-execution/ops/reduce.h" +#include "local-execution/ops/reduction.h" +#include "local-execution/ops/repartition.h" +#include "local-execution/ops/replicate.h" +#include "local-execution/ops/reshape.h" +#include "local-execution/ops/reverse.h" +#include "local-execution/ops/softmax.h" +#include "local-execution/ops/split.h" +#include "local-execution/ops/topk.h" +#include "local-execution/ops/transpose.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include From c6fed294c5b31001f978123c43681c0db32b3e0b Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 13 Nov 2024 13:24:19 -0800 Subject: [PATCH 22/91] Add tensor type, operate over reduced tensor --- .../local-execution/itask_argument_accessor.h | 8 +- .../layer_tensor_key.struct.toml | 23 +++ .../local-execution/local_slots_backing.h | 45 +++--- .../local_task_argument_accessor.h | 10 +- .../local-execution/local_training_backing.h | 13 +- .../include/local-execution/loss_functions.h | 2 +- .../local-execution/model_training_instance.h | 13 +- .../local-execution/op_task_invocation.h | 6 +- .../op_tensor_slot_spec.struct.toml | 6 +- .../include/local-execution/optimizer.h | 14 +- .../reduced_tensor_t.struct.toml | 13 ++ ...t.toml => slot_tensor_type_id.struct.toml} | 8 +- .../local-execution/task_argument_accessor.h | 59 +++++++- .../include/local-execution/task_binding.h | 12 +- .../include/local-execution/task_signature.h | 4 +- .../task_signature.struct.toml | 4 +- .../tensor_guid_slot_spec.struct.toml | 22 --- .../tensor_guid_spec.struct.toml | 23 --- .../local-execution/tensor_reduction.h | 15 ++ .../local-execution/tensor_type.enum.toml | 20 +++ .../tensor_type_slot_spec.struct.toml | 26 ++++ .../unified_tensor_guid.variant.toml | 21 --- .../src/local_slots_backing.cc | 137 +++++++++--------- .../src/local_task_argument_accessor.cc | 12 +- .../src/local_training_backing.cc | 40 ++--- lib/local-execution/src/loss_functions.cc | 19 ++- .../src/model_training_instance.cc | 64 +++++--- lib/local-execution/src/op_task_invocation.cc | 14 +- lib/local-execution/src/op_task_signature.cc | 42 ++++-- lib/local-execution/src/optimizer.cc | 45 +++--- lib/local-execution/src/task_binding.cc | 17 ++- lib/local-execution/src/task_signature.cc | 10 +- 
lib/local-execution/src/tensor_reduction.cc | 17 +++ .../include/op-attrs/operator_attrs.h | 4 +- 34 files changed, 471 insertions(+), 317 deletions(-) create mode 100644 lib/local-execution/include/local-execution/layer_tensor_key.struct.toml create mode 100644 lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml rename lib/local-execution/include/local-execution/{slot_grad_id.struct.toml => slot_tensor_type_id.struct.toml} (62%) delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_reduction.h create mode 100644 lib/local-execution/include/local-execution/tensor_type.enum.toml create mode 100644 lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml delete mode 100644 lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml create mode 100644 lib/local-execution/src/tensor_reduction.cc diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h index b4d188e4a3..9eff9460c2 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/local-execution/include/local-execution/itask_argument_accessor.h @@ -5,6 +5,7 @@ #include "local-execution/concrete_arg.h" #include "local-execution/op_task_signature.h" #include "local-execution/privilege_tensor_accessor.h" +#include "local-execution/tensor_type.dtg.h" namespace FlexFlow { @@ -15,10 +16,11 @@ struct ITaskArgumentAccessor { virtual ConcreteArgSpec const &get_concrete_arg(slot_id_t) const = 0; - virtual GenericTensorAccessor - get_tensor(slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + virtual GenericTensorAccessor get_tensor(slot_id_t slot, + Permissions priv, + TensorType tensor_type) const = 0; virtual VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const = 0; + slot_id_t slot, Permissions priv, TensorType tensor_type) const = 0; virtual Allocator get_allocator() const = 0; virtual size_t get_device_idx() const = 0; diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml new file mode 100644 index 0000000000..3ec6d7b0f1 --- /dev/null +++ b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "LayerTensorKey" +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "pcg/layer_guid_t.dtg.h", + "local-execution/reduced_tensor_t.dtg.h" +] + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::layer_guid_t" + +[[fields]] +name = "reduced_tensor" +type = "::FlexFlow::reduced_tensor_t" diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h index 46e66e97a2..a632f432cf 100644 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ b/lib/local-execution/include/local-execution/local_slots_backing.h @@ -3,6 +3,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H #include "kernels/accessor.h" +#include "local-execution/layer_tensor_key.dtg.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/non_graph_tensor_guid_t.dtg.h" #include 
"local-execution/op_task_invocation.h" @@ -10,26 +11,25 @@ #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/tensor_role.dtg.h" -#include "local-execution/unified_tensor_guid.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { +using LayerTensorBackingMap = + std::unordered_map; + using TensorBackingMap = - std::unordered_map; -using NonGraphTensorBackingMap = - std::unordered_map; + std::unordered_map; struct LocalSlotsBacking { - LocalSlotsBacking(TensorBackingMap const &, RuntimeArgConfig const &); + LocalSlotsBacking(LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &); public: void add_per_device_op_state(layer_guid_t const &, DeviceSpecificDeviceStates const &); - void insert_into_tensor_mapping(tensor_guid_t const &, - GenericTensorAccessorW const &); void allocate_layer_tensors(layer_guid_t const &, ComputationGraph const &, Allocator &); @@ -44,7 +44,9 @@ struct LocalSlotsBacking { TaskSignature const &); TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; - TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &, + std::optional const &) const; ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, layer_guid_t const &) const; ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; @@ -53,24 +55,27 @@ struct LocalSlotsBacking { ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, layer_guid_t const &) const; - GenericTensorAccessorW const &get_tensor_backing(UnifiedTensorGuid const &, - IsGrad) const; + GenericTensorAccessorW const & + get_tensor_backing(TensorType const &, + reduced_tensor_t const &, + std::optional const &) const; - bool is_tensor_allocated(tensor_guid_t const &) const; - bool is_gradient_tensor_allocated(tensor_guid_t const &) const; + bool is_forward_tensor_allocated(LayerTensorKey const &) const; + bool is_non_graph_tensor_allocated(reduced_tensor_t const &) const; public: // tensors - TensorBackingMap tensor_mapping; - TensorBackingMap gradient_tensor_mapping; - NonGraphTensorBackingMap optimizer_tensor_mapping; - std::unordered_map> + LayerTensorBackingMap tensor_mapping; + LayerTensorBackingMap gradient_tensor_mapping; + LayerTensorBackingMap optimizer_tensor_mapping; + TensorBackingMap non_graph_tensor_mapping; + std::unordered_map> input_tensor_slots; - std::unordered_map> + std::unordered_map> weight_tensor_slots; - std::unordered_map> + std::unordered_map> output_tensor_slots; - std::unordered_map> + std::unordered_map> weight_optimizer_tensor_guids; // arguments diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 1e1516a0de..db0e98c2b1 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" #include #include @@ 
-9,7 +9,7 @@ namespace FlexFlow { using TensorSlotsBacking = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant>>; using ArgSlotsBacking = std::unordered_map; @@ -25,9 +25,9 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, - IsGrad is_grad) const override; + TensorType tensor_type) const override; VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const override; + slot_id_t slot, Permissions priv, TensorType tensor_type) const override; Allocator get_allocator() const override; @@ -40,7 +40,7 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { }; using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotGradId, + SlotTensorTypeId, std::variant, std::vector>>>; diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 6dfa8ad443..cbab4bf031 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -15,7 +15,8 @@ using PerLayerElapsedTime = struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &); void register_and_allocate_layer(layer_guid_t const &); void allocate_layer_optimizer_tensors(layer_guid_t const &, @@ -24,17 +25,17 @@ struct LocalTrainingBacking { void execute_init(layer_guid_t const &); std::optional execute_forward(layer_guid_t const &); void compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor); std::optional execute_backward(layer_guid_t const &); void execute_update(layer_guid_t const &, OptimizerAttrs const &); - TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; + TaskArgumentAccessor + get_task_arg_accessor(TaskInvocation const &, + std::optional const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; - void insert_tensor(tensor_guid_t const &, GenericTensorAccessorW const &); - private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, TaskArgumentAccessor const &); diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index 2298115d5d..4ce74da766 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -26,7 +26,7 @@ namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation - backward(LossAttrs const &, tensor_guid_t logit, tensor_guid_t label); + backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 14473ff26e..5cc13f0b40 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -12,11 +12,12 @@ using PerLayerElapsedTime = 
struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, ComputationGraph const &, - TensorBackingMap const &, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &, LossAttrs const &, - tensor_guid_t const & logit_tensor, - tensor_guid_t const & label_tensor, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, OptimizerAttrs const &); void execute_init(); @@ -27,11 +28,11 @@ struct ModelTrainingInstance { ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; - tensor_guid_t logit_tensor; - tensor_guid_t label_tensor; + reduced_tensor_t logit_tensor; + reduced_tensor_t label_tensor; OptimizerAttrs optimizer_attrs; }; -} +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 0f351c3a0e..6484981ebf 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -10,7 +10,7 @@ #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_grad_id.dtg.h" +#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/variadic_tensor_ref.h" #include @@ -84,14 +84,14 @@ struct OpTaskBinding { bool operator==(OpTaskBinding const &other) const; bool operator!=(OpTaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void bind_from_forward(OpTaskBinding const &fwd); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml index 590dbe6362..54638a7eb6 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml @@ -11,7 +11,7 @@ includes = [ "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/tensor_role.dtg.h", - "local-execution/is_grad.dtg.h", + "local-execution/tensor_type.dtg.h", "local-execution/op_slot_options.dtg.h", ] @@ -28,8 +28,8 @@ name = "tensor_role" type = "::FlexFlow::TensorRole" [[fields]] -name = "is_grad" -type = "::FlexFlow::IsGrad" +name = "tensor_type" +type = "::FlexFlow::TensorType" [[fields]] name = "slot_option" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index acf9b8a550..2eb480a0c1 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -14,21 +14,21 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors); + reduced_tensor_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs 
const &,
-                          tensor_guid_t const &weight,
-                          non_graph_tensor_guid_t const &sgd_v);
+                          reduced_tensor_t const &weight,
+                          reduced_tensor_t const &sgd_v);
 TaskImplFunction get_sgd_update_task_impl();
 
 TaskSignature get_adam_update_signature();
 TaskInvocation adam_update(AdamOptimizerAttrs const &,
-                           tensor_guid_t const &weight,
-                           non_graph_tensor_guid_t const &adam_v,
-                           non_graph_tensor_guid_t const &adam_m);
+                           reduced_tensor_t const &weight,
+                           reduced_tensor_t const &adam_v,
+                           reduced_tensor_t const &adam_m);
 TaskImplFunction get_adam_update_task_impl();
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
new file mode 100644
index 0000000000..726249c970
--- /dev/null
+++ b/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml
@@ -0,0 +1,13 @@
+namespace = "FlexFlow"
+name = "reduced_tensor_t"
+features = [
+  "eq",
+  "ord",
+  "hash",
+  "fmt",
+]
+
+
+[[fields]]
+name = "raw_index"
+type = "int"
diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
similarity index 62%
rename from lib/local-execution/include/local-execution/slot_grad_id.struct.toml
rename to lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
index 256091d272..b3b3a320c7 100644
--- a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml
+++ b/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml
@@ -1,5 +1,5 @@
 namespace = "FlexFlow"
-name = "SlotGradId"
+name = "SlotTensorTypeId"
 features = [
   "eq",
   "ord",
@@ -8,7 +8,7 @@ features = [
 ]
 
 includes = [
-  "local-execution/is_grad.dtg.h",
+  "local-execution/tensor_type.dtg.h",
   "local-execution/slot_id_t.dtg.h",
 ]
 
@@ -17,5 +17,5 @@ name = "slot_id"
 type = "::FlexFlow::slot_id_t"
 
 [[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
+name = "tensor_type"
+type = "::FlexFlow::TensorType"
diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h
index 54c8dfc5f1..29d5fb8fbe 100644
--- a/lib/local-execution/include/local-execution/task_argument_accessor.h
+++ b/lib/local-execution/include/local-execution/task_argument_accessor.h
@@ -8,6 +8,7 @@ namespace FlexFlow {
 
 struct TaskArgumentAccessor {
+  // arguments
   template <typename T>
   T const &get_argument(slot_id_t slot) const {
     if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v<T>) {
@@ -24,6 +25,7 @@ struct TaskArgumentAccessor {
     return this->get_argument<T>(slot_id_t{slot});
   }
 
+  // tensors
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(int slot) const {
     return this->get_tensor<PRIV>(slot_id_t{slot});
   }
@@ -32,7 +34,7 @@ struct TaskArgumentAccessor {
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_tensor(slot, PRIV, TensorType::FORWARD));
   }
 
   template <Permissions PRIV>
@@ -43,9 +45,32 @@
   template <Permissions PRIV>
   privilege_mode_to_accessor<PRIV> get_tensor_grad(slot_id_t slot) const {
     return std::get<privilege_mode_to_accessor<PRIV>>(
-        this->ptr->get_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_tensor(slot, PRIV, TensorType::GRADIENT));
   }
 
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(int slot) const {
+    return this->get_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_optimizer_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(int slot) const {
+    return this->get_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  privilege_mode_to_accessor<PRIV> get_non_graph_tensor(slot_id_t slot) const {
+    return std::get<privilege_mode_to_accessor<PRIV>>(
+        this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH));
+  }
+
+  // variadic tensors
   template <Permissions PRIV>
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(int slot) const {
@@ -56,7 +81,7 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::NO));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::FORWARD));
   }
 
   template <Permissions PRIV>
@@ -69,7 +94,33 @@
   std::vector<privilege_mode_to_accessor<PRIV>>
       get_variadic_tensor_grad(slot_id_t slot) const {
     return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
-        this->ptr->get_variadic_tensor(slot, PRIV, IsGrad::YES));
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::GRADIENT));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(int slot) const {
+    return this->get_variadic_optimizer_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_optimizer_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER));
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(int slot) const {
+    return this->get_variadic_non_graph_tensor<PRIV>(slot_id_t{slot});
+  }
+
+  template <Permissions PRIV>
+  std::vector<privilege_mode_to_accessor<PRIV>>
+      get_variadic_non_graph_tensor(slot_id_t slot) const {
+    return std::get<std::vector<privilege_mode_to_accessor<PRIV>>>(
+        this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH));
   }
 
   Allocator get_allocator() const {
diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h
index 96c96473e4..93461e2e55 100644
--- a/lib/local-execution/include/local-execution/task_binding.h
+++ b/lib/local-execution/include/local-execution/task_binding.h
@@ -1,12 +1,12 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H
 
-#include "local-execution/slot_grad_id.dtg.h"
+#include "local-execution/reduced_tensor_t.dtg.h"
 #include "local-execution/slot_id_t.dtg.h"
+#include "local-execution/slot_tensor_type_id.dtg.h"
 #include "local-execution/task_arg_spec.dtg.h"
 #include "local-execution/task_id_t.dtg.h"
 #include "local-execution/task_signature.dtg.h"
-#include "local-execution/tensor_guid_spec.dtg.h"
 #include "utils/hash/unordered_map.h"
 
 namespace FlexFlow {
@@ -14,8 +14,8 @@ namespace FlexFlow {
 struct TaskBinding {
   TaskBinding() = default;
 
-  void bind(int, TensorGuidSpec const &);
-  void bind(slot_id_t, TensorGuidSpec const &);
+  void bind(int, TensorType const &, reduced_tensor_t const &);
+  void bind(slot_id_t, TensorType const &, reduced_tensor_t const &);
 
   template <typename T>
   void bind_arg(int name, T const &t) {
@@ -40,12 +40,12 @@ struct TaskBinding {
   bool operator==(TaskBinding const &other) const;
   bool operator!=(TaskBinding const &other) const;
 
-  std::unordered_map<SlotGradId, TensorGuidSpec> const &
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> const &
       get_tensor_bindings() const;
   std::unordered_map<slot_id_t, TaskArgSpec> const &get_arg_bindings() const;
 
 private:
-  std::unordered_map<SlotGradId, TensorGuidSpec> tensor_bindings;
+  std::unordered_map<SlotTensorTypeId, reduced_tensor_t> tensor_bindings;
   std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings;
 
 private:
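
The two headers above are the two halves of the new slot protocol: an invocation names each slot with a TensorType plus a reduced_tensor_t, and the task body later reads the slot back through the getter of the matching type. A minimal sketch of both sides (the slot name, indices, and accessor variable are hypothetical, not part of this patch):

    enum Slots { WEIGHT };
    TaskBinding b;
    reduced_tensor_t w = reduced_tensor_t{0};  // reduced index of the weight tensor
    b.bind(WEIGHT, TensorType::FORWARD, w);    // the weight's value
    b.bind(WEIGHT, TensorType::GRADIENT, w);   // the gradient buffer for the same tensor
    // ... and later, inside the task implementation, given a TaskArgumentAccessor acc:
    // auto weight      = acc.get_tensor<Permissions::RW>(WEIGHT);
    // auto weight_grad = acc.get_tensor_grad<Permissions::RO>(WEIGHT);
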
diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/local-execution/include/local-execution/task_signature.h
index 6da69f2441..b10edce6d4 100644
--- a/lib/local-execution/include/local-execution/task_signature.h
+++ b/lib/local-execution/include/local-execution/task_signature.h
@@ -10,11 +10,11 @@ TaskSignature make_empty_task_signature();
 
 void add_slot(TaskSignature &,
               int name,
-              IsGrad,
+              TensorType,
               SlotType slot_type = SlotType::TENSOR);
 void add_slot(TaskSignature &,
               slot_id_t name,
-              IsGrad,
+              TensorType,
               SlotType slot_type = SlotType::TENSOR);
 
 template <typename T>
diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/local-execution/include/local-execution/task_signature.struct.toml
index ac408a7b68..7efb0c658a 100644
--- a/lib/local-execution/include/local-execution/task_signature.struct.toml
+++ b/lib/local-execution/include/local-execution/task_signature.struct.toml
@@ -7,7 +7,7 @@ features = [
 ]
 
 includes = [
-  "local-execution/tensor_guid_slot_spec.dtg.h",
+  "local-execution/tensor_type_slot_spec.dtg.h",
   "local-execution/slot_id_t.dtg.h",
   "<unordered_map>",
   "<typeindex>"
 ]
@@ -30,4 +30,4 @@ type = "std::unordered_map<::FlexFlow::slot_id_t, std::type_index>"
 
 [[fields]]
 name = "tensor_guid_slots"
-type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorGuidSlotSpec>"
+type = "std::unordered_map<::FlexFlow::slot_id_t, ::FlexFlow::TensorTypeSlotSpec>"
diff --git a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
deleted file mode 100644
index 9b7e9c14f9..0000000000
--- a/lib/local-execution/include/local-execution/tensor_guid_slot_spec.struct.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-namespace = "FlexFlow"
-name = "TensorGuidSlotSpec"
-features = [
-  "eq",
-  "fmt",
-  "hash",
-  "ord",
-]
-
-includes = [
-  "local-execution/slot_type.dtg.h",
-  "local-execution/is_grad.dtg.h",
-]
-
-[[fields]]
-name = "slot_type"
-type = "::FlexFlow::SlotType"
-
-[[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
-
diff --git a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
deleted file mode 100644
index 1d147f60e5..0000000000
--- a/lib/local-execution/include/local-execution/tensor_guid_spec.struct.toml
+++ /dev/null
@@ -1,23 +0,0 @@
-namespace = "FlexFlow"
-name = "TensorGuidSpec"
-features = [
-  "eq",
-  "fmt",
-  "hash",
-  "ord"
-]
-
-includes = [
-  "pcg/tensor_guid_t.dtg.h",
-  "local-execution/is_grad.dtg.h",
-  "local-execution/unified_tensor_guid.dtg.h"
-]
-
-[[fields]]
-name = "tensor_guid"
-type = "::FlexFlow::UnifiedTensorGuid"
-
-[[fields]]
-name = "is_grad"
-type = "::FlexFlow::IsGrad"
-
diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_reduction.h
new file mode 100644
index 0000000000..eb55b07ee4
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_reduction.h
@@ -0,0 +1,15 @@
+#ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H
+#define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H
+
+#include "local-execution/reduced_tensor_t.dtg.h"
+#include "pcg/tensor_guid_t.dtg.h"
+
+namespace FlexFlow {
+
+reduced_tensor_t lower(tensor_guid_t const &);
+
+std::vector<reduced_tensor_t> lower(std::vector<tensor_guid_t> const &);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/local-execution/include/local-execution/tensor_type.enum.toml
new file mode 100644
index 0000000000..31ce5ba83a
--- /dev/null
+++ b/lib/local-execution/include/local-execution/tensor_type.enum.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "TensorType"
+features = [
+  "hash",
+  "fmt",
"rapidcheck", + "json", +] + +[[values]] +name = "NON_GRAPH" + +[[values]] +name = "FORWARD" + +[[values]] +name = "GRADIENT" + +[[values]] +name = "OPTIMIZER" diff --git a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml new file mode 100644 index 0000000000..ceba809474 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TensorTypeSlotSpec" +features = [ + "eq", + "fmt", + "hash", + "ord", +] + +includes = [ + "local-execution/slot_type.dtg.h", + "local-execution/slot_id_t.dtg.h", + "local-execution/tensor_type.dtg.h", +] + +[[fields]] +name = "slot_id" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "tensor_type" +type = "::FlexFlow::TensorType" + +[[fields]] +name = "slot_type" +type = "::FlexFlow::SlotType" diff --git a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml b/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml deleted file mode 100644 index 3d2cd8e45f..0000000000 --- a/lib/local-execution/include/local-execution/unified_tensor_guid.variant.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "UnifiedTensorGuid" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "local-execution/non_graph_tensor_guid_t.dtg.h", -] - -[[values]] -type = "::FlexFlow::tensor_guid_t" -key = "tensor_guid" - -[[values]] -type = "::FlexFlow::non_graph_tensor_guid_t" -key = "non_graph_tensor_guid" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index 25abc72567..f1bb5a9a5b 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -1,4 +1,5 @@ #include "local-execution/local_slots_backing.h" +#include "local-execution/tensor_reduction.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "utils/containers/contains_key.h" @@ -7,9 +8,12 @@ namespace FlexFlow { -LocalSlotsBacking::LocalSlotsBacking(TensorBackingMap const &allocated_tensors, - RuntimeArgConfig const &runtime_arg_config) - : tensor_mapping(allocated_tensors), +LocalSlotsBacking::LocalSlotsBacking( + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config) + : tensor_mapping(allocated_forward_tensors), + non_graph_tensor_mapping(allocated_non_graph_tensors), runtime_arg_config(runtime_arg_config){}; void LocalSlotsBacking::add_per_device_op_state( @@ -18,13 +22,6 @@ void LocalSlotsBacking::add_per_device_op_state( this->per_device_op_states.insert({op_guid, device_state}); } -void LocalSlotsBacking::insert_into_tensor_mapping( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - if (!contains_key(this->tensor_mapping, tensor)) { - this->tensor_mapping.insert({tensor, tensor_backing}); - } -} - void LocalSlotsBacking::allocate_layer_tensors( layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, @@ -46,15 +43,15 @@ void LocalSlotsBacking::allocate_tensors_by_role( switch (role) { case TensorRole::INPUT: tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, tensors}); + this->input_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::WEIGHT: 
tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, tensors}); + this->weight_tensor_slots.insert({layer_guid, lower(tensors)}); break; case TensorRole::OUTPUT: tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, tensors}); + this->output_tensor_slots.insert({layer_guid, lower(tensors)}); break; default: throw mk_runtime_error("Invalid tensor role, got {}", role); @@ -62,19 +59,22 @@ void LocalSlotsBacking::allocate_tensors_by_role( for (tensor_guid_t const &tensor : tensors) { TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + reduced_tensor_t reduced_tensor = lower(tensor); + LayerTensorKey layer_tensor_key = + LayerTensorKey{layer_guid, reduced_tensor}; // tensor allocation - if (!is_tensor_allocated(tensor)) { + if (!is_forward_tensor_allocated(layer_tensor_key)) { GenericTensorAccessorW tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_mapping.insert({tensor, tensor_backing}); + this->tensor_mapping.insert({layer_tensor_key, tensor_backing}); } // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && - !is_gradient_tensor_allocated(tensor)) { + if (tensor_attrs.create_gradients == CreateGrad::YES) { GenericTensorAccessorW gradient_tensor_backing = allocator.allocate_tensor(tensor_attrs.shape); - this->gradient_tensor_mapping.insert({tensor, gradient_tensor_backing}); + this->gradient_tensor_mapping.insert( + {layer_tensor_key, gradient_tensor_backing}); } } } @@ -85,53 +85,52 @@ void LocalSlotsBacking::allocate_optimizer_tensors( ComputationGraph const &cg, Allocator &allocator, TaskSignature const &sig) { - GenericTensorAccessorW weight_backing = - get_tensor_backing(UnifiedTensorGuid{weight}, IsGrad::NO); + GenericTensorAccessorW weight_backing = this->get_tensor_backing( + TensorType::FORWARD, lower(weight), weight_layer); int num_grad_buffer_tensors = sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector grad_buffer_tensors; + std::vector optimizer_buffer_tensors; for (int i = 0; i < num_grad_buffer_tensors; ++i) { - non_graph_tensor_guid_t buffer_tensor_guid = non_graph_tensor_guid_t{i}; + reduced_tensor_t buffer_tensor = reduced_tensor_t{i}; GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( get_tensor_shape(weight_backing.shape, weight_backing.data_type)); - this->optimizer_tensor_mapping.insert({buffer_tensor_guid, buffer_backing}); - grad_buffer_tensors.push_back(buffer_tensor_guid); + this->optimizer_tensor_mapping.insert( + {LayerTensorKey{weight_layer, buffer_tensor}, buffer_backing}); + optimizer_buffer_tensors.push_back(buffer_tensor); } this->weight_optimizer_tensor_guids.insert( - {weight_layer, grad_buffer_tensors}); + {weight_layer, optimizer_buffer_tensors}); } -bool LocalSlotsBacking::is_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_forward_tensor_allocated( + LayerTensorKey const &layer_tensor_id) const { + return contains_key(this->tensor_mapping, layer_tensor_id); } -bool LocalSlotsBacking::is_gradient_tensor_allocated( - tensor_guid_t const &tensor_id) const { - return contains_key(this->gradient_tensor_mapping, tensor_id); +bool LocalSlotsBacking::is_non_graph_tensor_allocated( + reduced_tensor_t const &tensor_id) const { + return contains_key(this->non_graph_tensor_mapping, tensor_id); } -GenericTensorAccessorW const & 
- LocalSlotsBacking::get_tensor_backing(UnifiedTensorGuid const &tensor_id, - IsGrad is_grad) const { - if (tensor_id.has()) { - tensor_guid_t graph_tensor_guid = tensor_id.get(); - switch (is_grad) { - case IsGrad::NO: - assert(contains_key(this->tensor_mapping, graph_tensor_guid)); - return this->tensor_mapping.at(graph_tensor_guid); - case IsGrad::YES: - assert(contains_key(this->gradient_tensor_mapping, graph_tensor_guid)); - return this->gradient_tensor_mapping.at(graph_tensor_guid); - default: - throw mk_runtime_error(fmt::format( - "IsGrad should only have YES or NO, received {}", is_grad)); - } - } else { - non_graph_tensor_guid_t non_graph_tensor_guid = - tensor_id.get(); - assert(contains_key(this->optimizer_tensor_mapping, non_graph_tensor_guid)); - return this->optimizer_tensor_mapping.at(non_graph_tensor_guid); +GenericTensorAccessorW const &LocalSlotsBacking::get_tensor_backing( + TensorType const &tensor_type, + reduced_tensor_t const &tensor_id, + std::optional const &layer_guid) const { + switch (tensor_type) { + case TensorType::FORWARD: + return this->tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::NON_GRAPH: + return this->non_graph_tensor_mapping.at(tensor_id); + case TensorType::GRADIENT: + return this->gradient_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + case TensorType::OPTIMIZER: + return this->optimizer_tensor_mapping.at( + LayerTensorKey{layer_guid.value(), tensor_id}); + default: + throw mk_runtime_error( + fmt::format("Invalid tensor type {}", tensor_type)); } } @@ -140,9 +139,9 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; + SlotTensorTypeId slot_grad_id = tensor_binding.first; OpTensorSpec tensor_spec = tensor_binding.second; - std::vector tensor_guids; + std::vector tensor_guids; int weight_adjusted_idx = 0; switch (tensor_spec.role) { case TensorRole::WEIGHT: @@ -162,26 +161,25 @@ TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( fmt::format("Invalid TensorRole {}", tensor_spec.role)); } - IsGrad is_grad = slot_grad_id.is_grad; - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{tensor_guids.at(tensor_spec.idx)}, is_grad); - - mapping.insert({slot_grad_id, tensor_backing}); + mapping.insert({slot_grad_id, + this->get_tensor_backing(slot_grad_id.tensor_type, + tensor_guids.at(tensor_spec.idx), + op_guid)}); } return mapping; } TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { + TaskBinding const &binding, + std::optional const &layer_guid) const { TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotGradId slot_grad_id = tensor_binding.first; - TensorGuidSpec tensor_spec = tensor_binding.second; - + reduced_tensor_t tensor_id = tensor_binding.second; + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; GenericTensorAccessorW accessor = this->get_tensor_backing( - UnifiedTensorGuid{tensor_spec.tensor_guid}, slot_grad_id.is_grad); - mapping.insert({slot_grad_id, accessor}); + slot_tensor_type_id.tensor_type, tensor_id, layer_guid); + mapping.insert({slot_tensor_type_id, accessor}); } return mapping; @@ -229,13 +227,14 @@ ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( op_arg_ref_spec.get_ref_type().get(); 
assert(contains_key(this->input_tensor_slots, op_guid)); - std::vector input_tensor_guids = + std::vector input_tensor_guids = this->input_tensor_slots.at(op_guid); assert(input_tensor_guids.size() > index_op_arg_ref.idx); - GenericTensorAccessorW tensor_backing = this->get_tensor_backing( - UnifiedTensorGuid{input_tensor_guids.at(index_op_arg_ref.idx)}, - IsGrad::NO); + GenericTensorAccessorW tensor_backing = + this->get_tensor_backing(TensorType::FORWARD, + input_tensor_guids.at(index_op_arg_ref.idx), + op_guid); ParallelTensorShape shape = lift_to_parallel( get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); return ConcreteArgSpec::create(shape); diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 5d0156201e..75479a1f88 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -19,10 +19,10 @@ ConcreteArgSpec const & } GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = { tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; @@ -34,10 +34,10 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( } } VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( - slot_id_t slot, Permissions priv, IsGrad is_grad) const { - SlotGradId slot_grad_pair = SlotGradId{slot, is_grad}; + slot_id_t slot, Permissions priv, TensorType tensor_type) const { + SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_grad_pair)); + this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { std::vector readonly_variadic_tensor_backing = {}; for (GenericTensorAccessorW const &tensor_backing : diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 0cb8146467..e432b1afe9 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,6 +3,7 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" +#include "local-execution/tensor_reduction.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" @@ -15,10 +16,13 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - TensorBackingMap const &tensor_backing_mapping, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config) : allocator(allocator), computation_graph(computation_graph), - local_slots_backing(tensor_backing_mapping, runtime_arg_config), + local_slots_backing(allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), task_registry(empty_task_registry()) {} void 
LocalTrainingBacking::register_and_allocate_layer( @@ -96,15 +100,16 @@ std::optional } void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor) { - assert(this->local_slots_backing.is_tensor_allocated(logit_tensor) && - this->local_slots_backing.is_tensor_allocated(label_tensor)); + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor) { + assert( + this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) && + this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + this->get_task_arg_accessor(loss_invocation, std::nullopt); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } @@ -130,28 +135,30 @@ void LocalTrainingBacking::execute_update( LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector grad_buffer_tensors = + reduced_tensor_t weight_tensor = + lower(get_only(get_outgoing_tensors(this->computation_graph, node))); + std::vector optimizer_buffer_tensors = this->local_slots_backing.weight_optimizer_tensor_guids.at(node); // get invocation TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, grad_buffer_tensors); + optimizer_attrs, weight_tensor, optimizer_buffer_tensors); // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + TaskArgumentAccessor accessor = + this->get_task_arg_accessor(invocation, node); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { + TaskInvocation const &invocation, + std::optional const &layer_guid) const { TensorSlotsBacking tensor_slots_backing = this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding); + invocation.binding, layer_guid); ArgSlotsBacking arg_slots_backing = this->local_slots_backing.construct_arg_slots_backing(invocation.binding); return TaskArgumentAccessor::create( @@ -171,9 +178,4 @@ TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( this->allocator, tensor_slots_backing, arg_slots_backing); } -void LocalTrainingBacking::insert_tensor( - tensor_guid_t const &tensor, GenericTensorAccessorW const &tensor_backing) { - this->local_slots_backing.insert_into_tensor_mapping(tensor, tensor_backing); -} - } // namespace FlexFlow diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index a37c1d706b..e54841acb5 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -24,20 +24,23 @@ enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, LOGIT, IsGrad::NO); - add_slot(sig, LABEL, IsGrad::NO); - add_slot(sig, LOGIT, IsGrad::YES); + add_slot(sig, LOGIT, TensorType::NON_GRAPH); + add_slot(sig, LABEL, 
TensorType::NON_GRAPH); + add_slot(sig, LOGIT, TensorType::GRADIENT); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); return sig; } -TaskInvocation - backward(LossAttrs const &attrs, tensor_guid_t logit, tensor_guid_t label) { +TaskInvocation backward(LossAttrs const &attrs, + reduced_tensor_t logit, + reduced_tensor_t label) { TaskBinding b; - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::NO}); - b.bind(LABEL, TensorGuidSpec{UnifiedTensorGuid{label}, IsGrad::NO}); - b.bind(LOGIT, TensorGuidSpec{UnifiedTensorGuid{logit}, IsGrad::YES}); + b.bind(LOGIT, TensorType::NON_GRAPH, logit); + b.bind(LABEL, TensorType::NON_GRAPH, label); + b.bind(LOGIT, TensorType::GRADIENT, logit); + b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index abdced1bb5..5a58e4c524 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,59 +1,77 @@ #include "local-execution/model_training_instance.h" #include "pcg/computation_graph.h" -#include "utils/containers/reversed.h" #include "pcg/optimizer_attrs.h" +#include "utils/containers/reversed.h" namespace FlexFlow { - -ModelTrainingInstance::ModelTrainingInstance(Allocator const & allocator, - ComputationGraph const & computation_graph, - TensorBackingMap const & tensor_backing_map, - RuntimeArgConfig const & runtime_arg_config, - LossAttrs const & loss_attrs, - tensor_guid_t const &logit_tensor, - tensor_guid_t const &label_tensor, - OptimizerAttrs const & optimizer_attrs) - : computation_graph(computation_graph), training_backing(allocator, computation_graph, tensor_backing_map, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { + +ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, + ComputationGraph const &computation_graph, + LayerTensorBackingMap const &allocated_forward_tensors, + TensorBackingMap const &allocated_non_graph_tensors, + RuntimeArgConfig const &runtime_arg_config, + LossAttrs const &loss_attrs, + reduced_tensor_t const &logit_tensor, + reduced_tensor_t const &label_tensor, + OptimizerAttrs const &optimizer_attrs) + : computation_graph(computation_graph), + training_backing(allocator, + computation_graph, + allocated_forward_tensors, + allocated_non_graph_tensors, + runtime_arg_config), + loss_attrs(loss_attrs), logit_tensor(logit_tensor), + label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { // allocate each layer's tensors - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors(node, this->optimizer_attrs); + this->training_backing.allocate_layer_optimizer_tensors( + node, this->optimizer_attrs); } } void ModelTrainingInstance::execute_init() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_init(node); } } PerLayerElapsedTime ModelTrainingInstance::execute_forward() { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { - std::optional elapsed_time = 
this->training_backing.execute_forward(node); + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { + std::optional elapsed_time = + this->training_backing.execute_forward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, this->label_tensor); - + this->training_backing.compute_loss( + this->loss_attrs, this->logit_tensor, this->label_tensor); + PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const & node: reversed(topological_ordering(this->computation_graph))) { - std::optional elapsed_time = this->training_backing.execute_backward(node); + for (layer_guid_t const &node : + reversed(topological_ordering(this->computation_graph))) { + std::optional elapsed_time = + this->training_backing.execute_backward(node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } void ModelTrainingInstance::execute_update() { - for (layer_guid_t const & node: topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { this->training_backing.execute_update(node, this->optimizer_attrs); } - this->optimizer_attrs = get_next_iteration_optimizer_attrs(this->optimizer_attrs); + this->optimizer_attrs = + get_next_iteration_optimizer_attrs(this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 19c8894b05..81bf185911 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -20,7 +20,8 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::NO}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::FORWARD}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -28,7 +29,8 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert({SlotGradId{slot, IsGrad::YES}, tensor_spec}); + this->tensor_bindings.insert( + {SlotTensorTypeId{slot, TensorType::GRADIENT}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -44,13 +46,13 @@ bool OpTaskBinding::operator!=(OpTaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> OpTaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & OpTaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } @@ -89,8 +91,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = - SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; + SlotTensorTypeId tensor_key = SlotTensorTypeId{ + op_tensor_slot_spec.name, op_tensor_slot_spec.tensor_type}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if 
(is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 932b330453..5c8b19265a 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -12,8 +12,12 @@ void OpTaskSignature::add_input_slot(int name, SlotType slot_type) { } void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -23,8 +27,12 @@ void OpTaskSignature::add_optional_input_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::INPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -38,7 +46,7 @@ void OpTaskSignature::add_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -54,7 +62,7 @@ void OpTaskSignature::add_optional_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::OPTIONAL_UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -68,7 +76,7 @@ void OpTaskSignature::add_output_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -80,8 +88,12 @@ void OpTaskSignature::add_bwd_optional_output_slot(int name, void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::OUTPUT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -94,7 +106,7 @@ void OpTaskSignature::add_weight_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - IsGrad::NO, + TensorType::FORWARD, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -105,8 +117,12 @@ void OpTaskSignature::add_optional_weight_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_weight_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ - name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{name, + slot_type, + TensorRole::WEIGHT, + TensorType::FORWARD, + OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -130,7 +146,7 @@ OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd) { 
OpTensorSlotSpec{op_tensor_slot_spec.name, op_tensor_slot_spec.slot_type, op_tensor_slot_spec.tensor_role, - IsGrad::YES, + TensorType::GRADIENT, op_tensor_slot_spec.slot_option}; bwd.op_tensor_slots.insert(grad_spec); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1e06dee96a..5c0d6c54f2 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -9,9 +9,10 @@ enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, WEIGHT, IsGrad::YES); - add_slot(sig, WEIGHT, IsGrad::NO); - add_slot(sig, SGD_V, IsGrad::YES); + add_slot(sig, WEIGHT, TensorType::FORWARD); + add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, SGD_V, TensorType::OPTIMIZER); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -21,13 +22,14 @@ TaskSignature get_sgd_update_signature() { } TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - non_graph_tensor_guid_t const &sgd_v) { + reduced_tensor_t const &weight, + reduced_tensor_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); + b.bind(WEIGHT, TensorType::FORWARD, weight); + b.bind(WEIGHT, TensorType::GRADIENT, weight); + if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorGuidSpec{UnifiedTensorGuid{sgd_v}, IsGrad::YES}); + b.bind(SGD_V, TensorType::OPTIMIZER, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -97,10 +99,11 @@ TaskImplFunction get_sgd_update_task_impl() { TaskSignature get_adam_update_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, WEIGHT, IsGrad::YES); - add_slot(sig, WEIGHT, IsGrad::NO); - add_slot(sig, ADAM_V, IsGrad::YES); - add_slot(sig, ADAM_M, IsGrad::YES); + add_slot(sig, WEIGHT, TensorType::FORWARD); + add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, ADAM_V, TensorType::OPTIMIZER); + add_slot(sig, ADAM_M, TensorType::OPTIMIZER); + add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -110,14 +113,14 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - non_graph_tensor_guid_t const &adam_v, - non_graph_tensor_guid_t const &adam_m) { + reduced_tensor_t const &weight, + reduced_tensor_t const &adam_v, + reduced_tensor_t const &adam_m) { TaskBinding b; - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::YES}); - b.bind(WEIGHT, TensorGuidSpec{UnifiedTensorGuid{weight}, IsGrad::NO}); - b.bind(ADAM_M, TensorGuidSpec{UnifiedTensorGuid{adam_m}, IsGrad::YES}); - b.bind(ADAM_V, TensorGuidSpec{UnifiedTensorGuid{adam_v}, IsGrad::YES}); + b.bind(WEIGHT, TensorType::FORWARD, weight); + b.bind(WEIGHT, TensorType::GRADIENT, weight); + b.bind(ADAM_M, TensorType::OPTIMIZER, adam_m); + b.bind(ADAM_V, TensorType::OPTIMIZER, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -191,8 +194,8 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, - tensor_guid_t const &weight, - std::vector const &grad_buffer_tensors) { + reduced_tensor_t const &weight, + std::vector const &grad_buffer_tensors) { return 
attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { return sgd_update(s, weight, grad_buffer_tensors.at(0)); diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 45d9d0cdb9..5261eec217 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -4,13 +4,16 @@ namespace FlexFlow { -void TaskBinding::bind(int name, TensorGuidSpec const &tensor_guid_spec) { - this->bind(slot_id_t{name}, tensor_guid_spec); +void TaskBinding::bind(int name, + TensorType const &tensor_type, + reduced_tensor_t const &binding) { + this->bind(slot_id_t{name}, tensor_type, binding); } -void TaskBinding::bind(slot_id_t name, TensorGuidSpec const &tensor_guid_spec) { - this->tensor_bindings.insert( - {SlotGradId{name, tensor_guid_spec.is_grad}, tensor_guid_spec}); +void TaskBinding::bind(slot_id_t name, + TensorType const &tensor_type, + reduced_tensor_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, tensor_type}, binding}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -26,13 +29,13 @@ bool TaskBinding::operator!=(TaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> TaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & TaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index 27bcbcd266..a608ab8ab8 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -8,17 +8,17 @@ TaskSignature make_empty_task_signature() { void add_slot(TaskSignature &task_signature, int name, - IsGrad is_grad, + TensorType tensor_type, SlotType slot_type) { - add_slot(task_signature, slot_id_t{name}, is_grad, slot_type); + add_slot(task_signature, slot_id_t{name}, tensor_type, slot_type); } void add_slot(TaskSignature &task_signature, slot_id_t name, - IsGrad is_grad, + TensorType tensor_type, SlotType slot_type) { - TensorGuidSlotSpec tensor_guid_slot_spec = - TensorGuidSlotSpec{slot_type, is_grad}; + TensorTypeSlotSpec tensor_guid_slot_spec = + TensorTypeSlotSpec{slot_type, tensor_type}; task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc new file mode 100644 index 0000000000..19324509bb --- /dev/null +++ b/lib/local-execution/src/tensor_reduction.cc @@ -0,0 +1,17 @@ +#include "local-execution/tensor_reduction.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { + return reduced_tensor_t{tensor_guid.raw_graph_output.idx}; +} + +std::vector + lower(std::vector const &tensor_guids) { + return transform(tensor_guids, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); +} + +} // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 11afc5b209..73473d6ac5 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -1,8 +1,6 @@ #ifndef _OPERATOR_PARAMS_H #define _OPERATOR_PARAMS_H -#include "op-attrs/ops/core.h" -#include "op-attrs/pcg_operator_attrs.dtg.h" #include 
"local-execution/ops/attention.h" #include "local-execution/ops/batch_matmul.h" #include "local-execution/ops/batch_norm.h" @@ -32,6 +30,8 @@ #include "local-execution/ops/split.h" #include "local-execution/ops/topk.h" #include "local-execution/ops/transpose.h" +#include "op-attrs/ops/core.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" #include "utils/variant.h" #include From 0cdfb1a7edd9ea283f678f06950054a701be8600 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 7 Jan 2025 15:14:49 -0800 Subject: [PATCH 23/91] Fixes --- lib/kernels/include/kernels/array_shape.h | 2 - lib/kernels/include/kernels/legion_dim.h | 3 - lib/kernels/src/array_shape.cc | 19 +++--- lib/kernels/src/cuda/ops/concat_kernels.cu | 4 +- lib/kernels/src/legion_dim.cc | 9 --- .../include/local-execution/arg_ref.h | 2 +- .../include/local-execution/concrete_arg.h | 6 +- .../layer_tensor_key.struct.toml | 2 - .../local-execution/local_training_backing.h | 2 +- .../include/local-execution/runtime_arg_ref.h | 12 ---- .../include/local-execution/task_binding.h | 8 +-- .../include/local-execution/task_registry.h | 2 +- .../{ => local-execution}/ops/attention.cc | 0 .../{ => local-execution}/ops/batch_matmul.cc | 0 .../{ => local-execution}/ops/batch_norm.cc | 0 .../src/{ => local-execution}/ops/cast.cc | 0 .../src/{ => local-execution}/ops/combine.cc | 0 .../src/{ => local-execution}/ops/concat.cc | 0 .../src/{ => local-execution}/ops/conv_2d.cc | 0 .../src/{ => local-execution}/ops/dropout.cc | 0 .../ops/element_binary.cc | 0 .../ops/element_unary.cc | 0 .../src/{ => local-execution}/ops/flat.cc | 0 .../src/{ => local-execution}/ops/gather.cc | 0 .../src/{ => local-execution}/ops/input.cc | 0 .../{ => local-execution}/ops/layer_norm.cc | 0 .../src/{ => local-execution}/ops/linear.cc | 0 .../src/{ => local-execution}/ops/noop.cc | 0 .../src/{ => local-execution}/ops/pool_2d.cc | 0 .../src/{ => local-execution}/ops/reduce.cc | 0 .../{ => local-execution}/ops/reduction.cc | 0 .../{ => local-execution}/ops/repartition.cc | 0 .../{ => local-execution}/ops/replicate.cc | 0 .../src/{ => local-execution}/ops/reshape.cc | 0 .../src/{ => local-execution}/ops/reverse.cc | 0 .../src/{ => local-execution}/ops/softmax.cc | 0 .../src/{ => local-execution}/ops/split.cc | 0 .../src/{ => local-execution}/ops/topk.cc | 0 .../{ => local-execution}/ops/transpose.cc | 0 .../src/{ => local-execution}/ops/weight.cc | 0 .../src/local_cost_estimator.cc | 22 ++----- .../src/local_training_backing.cc | 13 +++-- .../src/model_training_instance.cc | 2 +- lib/local-execution/src/task_binding.cc | 13 +++++ lib/local-execution/src/task_registry.cc | 2 +- lib/local-execution/src/task_signature.cc | 2 +- lib/local-execution/src/tensor_reduction.cc | 2 +- .../test/src/test_local_slots_backing.cc | 47 ++++++++------- .../test/src/test_local_task_arg_accessor.cc | 56 +++++++++--------- lib/local-execution/test/src/test_loss_e2e.cc | 35 +++++------ .../test/src/test_update_e2e.cc | 9 +-- .../include/op-attrs/operator_attrs.h | 58 +++++++++---------- lib/pcg/include/pcg/optimizer_attrs.h | 2 +- lib/pcg/src/pcg/optimizer_attrs.cc | 2 +- 54 files changed, 154 insertions(+), 182 deletions(-) rename lib/local-execution/src/{ => local-execution}/ops/attention.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/batch_matmul.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/batch_norm.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/cast.cc (100%) rename 
lib/local-execution/src/{ => local-execution}/ops/combine.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/concat.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/conv_2d.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/dropout.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/element_binary.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/element_unary.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/flat.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/gather.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/input.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/layer_norm.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/linear.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/noop.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/pool_2d.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reduce.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reduction.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/repartition.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/replicate.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reshape.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/reverse.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/softmax.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/split.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/topk.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/transpose.cc (100%) rename lib/local-execution/src/{ => local-execution}/ops/weight.cc (100%) diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index fd66697793..bc3ca34e6a 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -43,8 +43,6 @@ struct ArrayShape { std::optional at_maybe(legion_dim_t) const; std::optional at_maybe(ff_dim_t) const; - ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const; - ArrayShape sub_shape(std::optional start, std::optional end) const; diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 29c5e29a93..e4dd9723b8 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -10,9 +10,6 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); -std::optional legion_dim_from_ff_dim(std::optional, - int num_dimensions); - template using LegionOrdered = DimOrdered; diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 31ee7b6001..eb2b88b203 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -1,6 +1,7 @@ #include "kernels/array_shape.h" #include "op-attrs/dim_ordered/slice.h" #include "utils/containers/product.h" +#include "utils/containers/transform.h" namespace FlexFlow { @@ -54,17 +55,17 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } -ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const { - legion_dim_t legion_end = legion_dim_from_ff_dim(end, num_dims()); - return this->sub_shape(start, legion_end); -} - ArrayShape ArrayShape::sub_shape(std::optional start, 
std::optional end) const { - std::optional legion_start = - legion_dim_from_ff_dim(start, num_dims()); - std::optional legion_end = - legion_dim_from_ff_dim(end, num_dims()); + std::optional legion_start = transform( + start, [&](auto const &start_unwrapped) { + return legion_dim_from_ff_dim(start_unwrapped, num_dims()); + }); + + std::optional legion_end = transform( + end, [&](auto const &end_unwrapped) { + return legion_dim_from_ff_dim(end_unwrapped, num_dims()); + }); return this->sub_shape(legion_start, legion_end); } diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..ce6178c7cc 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -15,6 +15,7 @@ #include "device.h" #include "kernels/concat_kernels.h" +#include "kernels/legion_dim.h" #include namespace FlexFlow { @@ -25,7 +26,8 @@ void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements(); + legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims()); + blk_size = shape.sub_shape(legion_dim_t{0}, axis_legion_dim).num_elements(); num_blocks = shape.sub_shape(axis, std::nullopt).num_elements(); } diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index c190a02220..9ef47d40ae 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -10,13 +10,4 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { return legion_dim_t(num_dimensions - ff_dim.value - 1); } -std::optional - legion_dim_from_ff_dim(std::optional ff_dim, int num_dimensions) { - if (ff_dim.has_value()) { - return legion_dim_from_ff_dim(ff_dim.value(), num_dimensions); - } else { - return std::nullopt; - } -} - } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/local-execution/include/local-execution/arg_ref.h index 30da405c13..75eecda273 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/local-execution/include/local-execution/arg_ref.h @@ -82,7 +82,7 @@ template struct hash<::FlexFlow::ArgRefSpec> { size_t operator()(::FlexFlow::ArgRefSpec const &s) const { size_t result = 0; - ::FlexFlow::hash_combine(result, s.type_idx); + ::FlexFlow::hash_combine(result, s.type_idx, s.get_ref_type()); return result; } }; diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/local-execution/include/local-execution/concrete_arg.h index ac5d97f3c4..cee52ba4a2 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/local-execution/include/local-execution/concrete_arg.h @@ -24,6 +24,10 @@ struct ConcreteArgSpec { return this->type_idx; } + std::shared_ptr get_ptr() const { + return this->ptr; + } + bool operator==(ConcreteArgSpec const &other) const; bool operator!=(ConcreteArgSpec const &other) const; @@ -60,7 +64,7 @@ template <> struct hash<::FlexFlow::ConcreteArgSpec> { size_t operator()(::FlexFlow::ConcreteArgSpec const &s) const { size_t result = 0; - ::FlexFlow::hash_combine(result, s.get_type_index()); + ::FlexFlow::hash_combine(result, s.get_type_index(), s.get_ptr()); return result; } }; diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml index 3ec6d7b0f1..33a7a9174f 100644 --- 
a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml +++ b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml @@ -4,8 +4,6 @@ features = [ "eq", "ord", "hash", - "json", - "rapidcheck", "fmt", ] diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index cbab4bf031..26ebfbe3c4 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -35,6 +35,7 @@ struct LocalTrainingBacking { std::optional const &) const; TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, layer_guid_t const &) const; + LocalSlotsBacking local_slots_backing; private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, @@ -45,7 +46,6 @@ struct LocalTrainingBacking { Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; - LocalSlotsBacking local_slots_backing; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/local-execution/include/local-execution/runtime_arg_ref.h index fd79e23126..a225a813df 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/local-execution/include/local-execution/runtime_arg_ref.h @@ -27,18 +27,6 @@ RuntimeArgRef profiling_settings(); RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); -// std::string format_as(RuntimeArgRefSpec const & x) { -// std::ostringstream oss; -// oss << ""; -// return oss.str(); -// } - -// std::ostream &operator<<(std::ostream & s, RuntimeArgRefSpec const & x) { -// return (s << fmt::to_string(x)); -// } - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 93461e2e55..e211592ea6 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -7,7 +7,6 @@ #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" -#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -63,12 +62,7 @@ namespace std { template <> struct hash<::FlexFlow::TaskBinding> { - size_t operator()(::FlexFlow::TaskBinding const &s) const { - size_t result = 0; - hash_combine(result, s.get_tensor_bindings()); - hash_combine(result, s.get_arg_bindings()); - return result; - } + size_t operator()(::FlexFlow::TaskBinding const &s) const; }; } // namespace std diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 24790a28e3..fa3e558337 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -14,7 +14,7 @@ void register_tasks_for_layer(TaskRegistry &, layer_guid_t const &, ComputationGraphOpAttrs const &attrs); -bool registry_contains_op_task(TaskRegistry const &, +bool registry_contains_task_for_layer(TaskRegistry const &, layer_guid_t const &, OpTaskType const &); diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc similarity index 100% rename from lib/local-execution/src/ops/attention.cc rename to lib/local-execution/src/local-execution/ops/attention.cc diff --git 
a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc similarity index 100% rename from lib/local-execution/src/ops/batch_matmul.cc rename to lib/local-execution/src/local-execution/ops/batch_matmul.cc diff --git a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/local-execution/ops/batch_norm.cc similarity index 100% rename from lib/local-execution/src/ops/batch_norm.cc rename to lib/local-execution/src/local-execution/ops/batch_norm.cc diff --git a/lib/local-execution/src/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc similarity index 100% rename from lib/local-execution/src/ops/cast.cc rename to lib/local-execution/src/local-execution/ops/cast.cc diff --git a/lib/local-execution/src/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc similarity index 100% rename from lib/local-execution/src/ops/combine.cc rename to lib/local-execution/src/local-execution/ops/combine.cc diff --git a/lib/local-execution/src/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc similarity index 100% rename from lib/local-execution/src/ops/concat.cc rename to lib/local-execution/src/local-execution/ops/concat.cc diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/local-execution/ops/conv_2d.cc similarity index 100% rename from lib/local-execution/src/ops/conv_2d.cc rename to lib/local-execution/src/local-execution/ops/conv_2d.cc diff --git a/lib/local-execution/src/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc similarity index 100% rename from lib/local-execution/src/ops/dropout.cc rename to lib/local-execution/src/local-execution/ops/dropout.cc diff --git a/lib/local-execution/src/ops/element_binary.cc b/lib/local-execution/src/local-execution/ops/element_binary.cc similarity index 100% rename from lib/local-execution/src/ops/element_binary.cc rename to lib/local-execution/src/local-execution/ops/element_binary.cc diff --git a/lib/local-execution/src/ops/element_unary.cc b/lib/local-execution/src/local-execution/ops/element_unary.cc similarity index 100% rename from lib/local-execution/src/ops/element_unary.cc rename to lib/local-execution/src/local-execution/ops/element_unary.cc diff --git a/lib/local-execution/src/ops/flat.cc b/lib/local-execution/src/local-execution/ops/flat.cc similarity index 100% rename from lib/local-execution/src/ops/flat.cc rename to lib/local-execution/src/local-execution/ops/flat.cc diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/local-execution/ops/gather.cc similarity index 100% rename from lib/local-execution/src/ops/gather.cc rename to lib/local-execution/src/local-execution/ops/gather.cc diff --git a/lib/local-execution/src/ops/input.cc b/lib/local-execution/src/local-execution/ops/input.cc similarity index 100% rename from lib/local-execution/src/ops/input.cc rename to lib/local-execution/src/local-execution/ops/input.cc diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/local-execution/ops/layer_norm.cc similarity index 100% rename from lib/local-execution/src/ops/layer_norm.cc rename to lib/local-execution/src/local-execution/ops/layer_norm.cc diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc similarity index 100% rename from lib/local-execution/src/ops/linear.cc rename to lib/local-execution/src/local-execution/ops/linear.cc diff --git a/lib/local-execution/src/ops/noop.cc 
b/lib/local-execution/src/local-execution/ops/noop.cc similarity index 100% rename from lib/local-execution/src/ops/noop.cc rename to lib/local-execution/src/local-execution/ops/noop.cc diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/local-execution/ops/pool_2d.cc similarity index 100% rename from lib/local-execution/src/ops/pool_2d.cc rename to lib/local-execution/src/local-execution/ops/pool_2d.cc diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/local-execution/ops/reduce.cc similarity index 100% rename from lib/local-execution/src/ops/reduce.cc rename to lib/local-execution/src/local-execution/ops/reduce.cc diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/local-execution/ops/reduction.cc similarity index 100% rename from lib/local-execution/src/ops/reduction.cc rename to lib/local-execution/src/local-execution/ops/reduction.cc diff --git a/lib/local-execution/src/ops/repartition.cc b/lib/local-execution/src/local-execution/ops/repartition.cc similarity index 100% rename from lib/local-execution/src/ops/repartition.cc rename to lib/local-execution/src/local-execution/ops/repartition.cc diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/local-execution/ops/replicate.cc similarity index 100% rename from lib/local-execution/src/ops/replicate.cc rename to lib/local-execution/src/local-execution/ops/replicate.cc diff --git a/lib/local-execution/src/ops/reshape.cc b/lib/local-execution/src/local-execution/ops/reshape.cc similarity index 100% rename from lib/local-execution/src/ops/reshape.cc rename to lib/local-execution/src/local-execution/ops/reshape.cc diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/local-execution/ops/reverse.cc similarity index 100% rename from lib/local-execution/src/ops/reverse.cc rename to lib/local-execution/src/local-execution/ops/reverse.cc diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/local-execution/ops/softmax.cc similarity index 100% rename from lib/local-execution/src/ops/softmax.cc rename to lib/local-execution/src/local-execution/ops/softmax.cc diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/local-execution/ops/split.cc similarity index 100% rename from lib/local-execution/src/ops/split.cc rename to lib/local-execution/src/local-execution/ops/split.cc diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/local-execution/ops/topk.cc similarity index 100% rename from lib/local-execution/src/ops/topk.cc rename to lib/local-execution/src/local-execution/ops/topk.cc diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc similarity index 100% rename from lib/local-execution/src/ops/transpose.cc rename to lib/local-execution/src/local-execution/ops/transpose.cc diff --git a/lib/local-execution/src/ops/weight.cc b/lib/local-execution/src/local-execution/ops/weight.cc similarity index 100% rename from lib/local-execution/src/ops/weight.cc rename to lib/local-execution/src/local-execution/ops/weight.cc diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index c99a2b154f..02265281b0 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,4 +1,5 @@ #include "local-execution/local_cost_estimator.h" +#include "local-execution/tensor_reduction.h" #include 
"kernels/device.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/tracked_allocator.h" @@ -8,21 +9,11 @@ #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" +#include "utils/containers/values.h" +#include "utils/containers/sum.h" namespace FlexFlow { -static float get_total_elapsed_time(PerLayerElapsedTime const &fwd, - PerLayerElapsedTime const &bwd) { - float total_elapsed_time = 0; - for (auto const &layer_elapsed_time : fwd) { - layer_guid_t layer_id = layer_elapsed_time.first; - float fwd_time = layer_elapsed_time.second.value(); - float bwd_time = bwd.at(layer_id).value(); - total_elapsed_time += fwd_time + bwd_time; - } - return total_elapsed_time; -} - LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} @@ -45,7 +36,6 @@ CostDetails LocalCostEstimator::estimate_cost( std::shared_ptr tracked_allocator_ptr = std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - TensorBackingMap tensor_backing_map; std::vector input_tensor_ids; ComputationGraphBuilder cg_builder; @@ -53,9 +43,6 @@ CostDetails LocalCostEstimator::estimate_cost( TensorShape tensor_shape = get_piece_shape(input); tensor_guid_t tensor_id = cg_builder.create_input(tensor_shape, CreateGrad::YES); - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_shape); - tensor_backing_map.insert({tensor_id, tensor_backing}); input_tensor_ids.push_back(tensor_id); } @@ -79,7 +66,8 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index e432b1afe9..f02a8c7824 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -68,7 +68,7 @@ std::optional } void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::INIT)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -85,7 +85,7 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { std::optional LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -102,11 +102,10 @@ std::optional void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, reduced_tensor_t const &logit_tensor, reduced_tensor_t const &label_tensor) { - assert( - this->local_slots_backing.is_non_graph_tensor_allocated(logit_tensor) && - this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); + assert(this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // 
assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = this->get_task_arg_accessor(loss_invocation, std::nullopt); @@ -116,7 +115,7 @@ void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, std::optional LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) { - if (registry_contains_op_task( + if (registry_contains_task_for_layer( this->task_registry, operator_node, OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; @@ -143,6 +142,8 @@ void LocalTrainingBacking::execute_update( // get invocation TaskInvocation invocation = get_update_invocation( optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); // execute update diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 5a58e4c524..4815de5e85 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -71,7 +71,7 @@ void ModelTrainingInstance::execute_update() { this->training_backing.execute_update(node, this->optimizer_attrs); } this->optimizer_attrs = - get_next_iteration_optimizer_attrs(this->optimizer_attrs); + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 5261eec217..2b1256df90 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,6 +1,7 @@ #include "local-execution/task_binding.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" +#include "utils/hash/unordered_map.h" namespace FlexFlow { @@ -58,3 +59,15 @@ std::ostream &operator<<(std::ostream &s, TaskBinding const &x) { } } // namespace FlexFlow + +namespace std { + +size_t hash<::FlexFlow::TaskBinding>::operator() ( + ::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; + } + +} // namespace std diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 3cd2cccae8..be1cf73e11 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -42,7 +42,7 @@ void register_tasks_for_layer(TaskRegistry &task_registry, } } -bool registry_contains_op_task(TaskRegistry const &task_registry, +bool registry_contains_task_for_layer(TaskRegistry const &task_registry, layer_guid_t const &op, OpTaskType const &op_task_type) { std::unordered_map> task_ids; diff --git a/lib/local-execution/src/task_signature.cc b/lib/local-execution/src/task_signature.cc index a608ab8ab8..1d57a1fc54 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/local-execution/src/task_signature.cc @@ -18,7 +18,7 @@ void add_slot(TaskSignature &task_signature, TensorType tensor_type, SlotType slot_type) { TensorTypeSlotSpec tensor_guid_slot_spec = - TensorTypeSlotSpec{slot_type, tensor_type}; + TensorTypeSlotSpec{name, tensor_type, slot_type}; task_signature.tensor_guid_slots.insert({name, tensor_guid_slot_spec}); } diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc index 19324509bb..1d0cb7a2e9 100644 --- 
a/lib/local-execution/src/tensor_reduction.cc +++ b/lib/local-execution/src/tensor_reduction.cc @@ -4,7 +4,7 @@ namespace FlexFlow { reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { - return reduced_tensor_t{tensor_guid.raw_graph_output.idx}; + return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; } std::vector diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 5d58e7e757..c9e95fe444 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -2,10 +2,12 @@ #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" #include "local-execution/local_slots_backing.h" +#include "local-execution/tensor_reduction.h" #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" +#include "utils/containers/get_only.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" #include "test/utils/doctest/fmt/variant.h" @@ -66,8 +68,12 @@ TEST_SUITE(FF_TEST_SUITE) { layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name); - TensorBackingMap tensor_backing_map = { - {query_guid, query}, {key_guid, key}, {value_guid, value}}; + LayerTensorBackingMap layer_tensor_backing_map = { + {LayerTensorKey{layer_guid, lower(query_guid)}, query}, + {LayerTensorKey{layer_guid, lower(key_guid)}, key}, + {LayerTensorKey{layer_guid, lower(value_guid)}, value}, + //{LayerTensorKey{layer_guid, lower(output_guid), output}} + }; // runtime arg config ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, @@ -78,14 +84,15 @@ TEST_SUITE(FF_TEST_SUITE) { EnableProfiling::NO, settings}; - LocalSlotsBacking local_slots_backing = {tensor_backing_map, + LocalSlotsBacking local_slots_backing = {layer_tensor_backing_map, + TensorBackingMap{}, runtime_arg_config}; SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") { auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - TensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(t); + [&](tensor_guid_t t, layer_guid_t l, + LayerTensorBackingMap m) -> std::pair { + GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); return get_shape_and_datatype(accessor); }; @@ -101,7 +108,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Query grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, local_slots_backing.gradient_tensor_mapping); + query_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{query_shape}, dtype}; CHECK(result == correct); @@ -109,7 +116,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, local_slots_backing.gradient_tensor_mapping); + key_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{key_shape}, dtype}; CHECK(result == correct); @@ -117,7 +124,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, local_slots_backing.gradient_tensor_mapping); + value_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{value_shape}, dtype}; CHECK(result == correct); @@ -132,7 +139,7 @@ 
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.tensor_mapping); + output_guid, layer_guid, local_slots_backing.tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -143,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.gradient_tensor_mapping); + output_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -159,19 +166,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Input tensor slots") { std::vector correct_incoming_input_tensors = get_incoming_inputs(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_input_tensors == + CHECK(lower(correct_incoming_input_tensors) == local_slots_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { std::vector correct_incoming_weight_tensors = get_incoming_weights(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_weight_tensors == + CHECK(lower(correct_incoming_weight_tensors) == local_slots_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { std::vector correct_outgoing_tensors = get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_outgoing_tensors == + CHECK(lower(correct_outgoing_tensors) == local_slots_backing.output_tensor_slots.at(layer_guid)); } } @@ -231,12 +238,12 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_attrs.shape); return get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking{ - {SlotGradId{slot_id_t{QUERY}, IsGrad::NO}, query}, - {SlotGradId{slot_id_t{KEY}, IsGrad::NO}, key}, - {SlotGradId{slot_id_t{VALUE}, IsGrad::NO}, value}, - {SlotGradId{slot_id_t{WEIGHTS}, IsGrad::NO}, weights}, - {SlotGradId{slot_id_t{OUTPUT}, IsGrad::NO}, output}, - {SlotGradId{slot_id_t{QUERY}, IsGrad::YES}, query}}); + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, query}, + {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, + {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, value}, + {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, weights}, + {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, output}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, query}}); }(); CHECK(result == correct); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f52fccb1ed..bddda7acd1 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -37,68 +37,68 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorSlotsBacking tensor_slots_backing = { - {SlotGradId{slot_id_t{INPUT}, IsGrad::NO}, input}, - {SlotGradId{slot_id_t{INPUT}, IsGrad::YES}, input_grad}, - {SlotGradId{slot_id_t{VARIADIC_TENSORS}, IsGrad::NO}, variadic_tensors}, - {SlotGradId{slot_id_t{VARIADIC_TENSORS}, IsGrad::YES}, + {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, + {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, variadic_tensors}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, variadic_tensors_grad}, }; 
LocalTaskArgumentAccessor acc = {allocator, tensor_slots_backing, {}}; SUBCASE("get_tensor") { - SUBCASE("get_tensor(slot_id_t, Permissions::RO, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input)}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RO, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RW, IsGrad::NO)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, IsGrad::NO); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD); CHECK(correct == result); } - SUBCASE("get_tensor(slot_id_t, Permissions::RW, IsGrad::YES)") { + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, IsGrad::YES); + acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT); CHECK(correct == result); } } SUBCASE("get_variadic_tensor") { - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{std::vector{ read_only_accessor_from_write_accessor(variadic_tensors.at(0)), read_only_accessor_from_write_accessor( variadic_tensors.at(1))}}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RO, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{std::vector{ read_only_accessor_from_write_accessor( @@ -106,35 +106,35 @@ 
TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor( variadic_tensors_grad.at(1))}}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RO, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::WO, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors_grad}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::WO, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::NO)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RW, IsGrad::NO); + slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD); CHECK(result == correct); } - SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, IsGrad::YES)") { + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = VariadicGenericTensorAccessor{variadic_tensors_grad}; VariadicGenericTensorAccessor result = acc.get_variadic_tensor( - slot_id_t{VARIADIC_TENSORS}, Permissions::RW, IsGrad::YES); + slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::GRADIENT); CHECK(result == correct); } } diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index c4662d624c..5793d02f31 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -1,4 +1,5 @@ #include "doctest/doctest.h" +#include "local-execution/tensor_reduction.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" @@ -35,61 +36,53 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::string layer_name = "scalar multiply"; tensor_guid_t logit_tensor = cg_builder.scalar_multiply(input_tensor, scalar, layer_name); + layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name); // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); - TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = - allocator.allocate_tensor(input_shape); - tensor_backing_map.insert({input_tensor, input_backing}); LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, runtime_arg_config); - // for (layer_guid_t const & node: - // topological_ordering(cg_builder.computation_graph)) { - // local_backing.register_and_allocate_layer(node); 
- // } - local_backing.register_and_allocate_layer( - get_layer_by_name(cg_builder.computation_graph, layer_name)); + + local_backing.register_and_allocate_layer(layer_guid); SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { TensorShape label_shape = TensorShape{ TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - tensor_guid_t label_tensor = - cg_builder.create_input(label_shape, CreateGrad::NO); + reduced_tensor_t label_tensor = reduced_tensor_t{-1}; GenericTensorAccessorW label_backing = allocator.allocate_tensor(label_shape); - local_backing.insert_tensor(label_tensor, label_backing); + local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing}); LossAttrs loss_attrs = LossAttrs{ SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("NonconfigurableLossAttrs") { - tensor_guid_t label_tensor = - cg_builder.create_input(input_shape, CreateGrad::NO); + reduced_tensor_t label_tensor = reduced_tensor_t{-1}; GenericTensorAccessorW label_backing = allocator.allocate_tensor(input_shape); - local_backing.insert_tensor(label_tensor, label_backing); + local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing}); SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss(loss_attrs, logit_tensor, label_tensor); + local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); } } } diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index b48214d89d..2e5e386a95 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,6 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" +#include "local-execution/tensor_reduction.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" @@ -37,14 +38,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // allocate memory Allocator allocator = create_local_cuda_memory_allocator(); - TensorBackingMap tensor_backing_map; - GenericTensorAccessorW input_backing = - allocator.allocate_tensor(input_shape); - tensor_backing_map.insert({input_tensor, input_backing}); - LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - tensor_backing_map, + LayerTensorBackingMap{}, + TensorBackingMap{}, runtime_arg_config); // for (layer_guid_t const & node: // topological_ordering(cg_builder.computation_graph)) { diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 73473d6ac5..483e735196 
100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -1,35 +1,35 @@ #ifndef _OPERATOR_PARAMS_H #define _OPERATOR_PARAMS_H -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/broadcast.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include "local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" +#include "op-attrs/ops/attention.h" +#include "op-attrs/ops/batch_matmul.h" +#include "op-attrs/ops/batch_norm.h" +#include "op-attrs/ops/broadcast.h" +#include "op-attrs/ops/cast.h" +#include "op-attrs/ops/combine.h" +#include "op-attrs/ops/concat.h" +#include "op-attrs/ops/conv_2d.h" +#include "op-attrs/ops/dropout.h" +#include "op-attrs/ops/element_binary.h" +#include "op-attrs/ops/element_unary.h" +#include "op-attrs/ops/embedding.h" +#include "op-attrs/ops/flat.h" +#include "op-attrs/ops/gather.h" +#include "op-attrs/ops/input.h" +#include "op-attrs/ops/layer_norm.h" +#include "op-attrs/ops/linear.h" +#include "op-attrs/ops/noop.h" +#include "op-attrs/ops/pool_2d.h" +#include "op-attrs/ops/reduce.h" +#include "op-attrs/ops/reduction.h" +#include "op-attrs/ops/repartition.h" +#include "op-attrs/ops/replicate.h" +#include "op-attrs/ops/reshape.h" +#include "op-attrs/ops/reverse.h" +#include "op-attrs/ops/softmax.h" +#include "op-attrs/ops/split.h" +#include "op-attrs/ops/topk.h" +#include "op-attrs/ops/transpose.h" #include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index d4abd1b52f..1d74694c29 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -6,7 +6,7 @@ namespace FlexFlow { -OptimizerAttrs get_next_iteration_optimizer_attrs(OptimizerAttrs const &old); +OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 8d66f7af7e..ce2d3d0db7 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -3,7 +3,7 @@ namespace FlexFlow { OptimizerAttrs - get_next_iteration_optimizer_attrs(OptimizerAttrs const &old_attrs) { + get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old_attrs) { if (old_attrs.has()) { AdamOptimizerAttrs old = old_attrs.get(); double new_beta1_t = old.beta_t * old.beta1; From 
9d252b37ef49d92cd358346472a9a94589ad7f81 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 15 Jan 2025 14:32:29 -0800 Subject: [PATCH 24/91] Remove tensor lower --- .../local-execution/tensor_reduction.h | 2 -- .../src/local_slots_backing.cc | 18 +++++++++++--- lib/local-execution/src/tensor_reduction.cc | 7 ------ .../test/src/test_local_slots_backing.cc | 24 ++++++++++++------- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_reduction.h index eb55b07ee4..2cb0b12ff0 100644 --- a/lib/local-execution/include/local-execution/tensor_reduction.h +++ b/lib/local-execution/include/local-execution/tensor_reduction.h @@ -8,8 +8,6 @@ namespace FlexFlow { reduced_tensor_t lower(tensor_guid_t const &); -std::vector lower(std::vector const &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc index f1bb5a9a5b..8a277adc78 100644 --- a/lib/local-execution/src/local_slots_backing.cc +++ b/lib/local-execution/src/local_slots_backing.cc @@ -43,15 +43,27 @@ void LocalSlotsBacking::allocate_tensors_by_role( switch (role) { case TensorRole::INPUT: tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, lower(tensors)}); + this->input_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; case TensorRole::WEIGHT: tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, lower(tensors)}); + this->weight_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; case TensorRole::OUTPUT: tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, lower(tensors)}); + this->output_tensor_slots.insert({layer_guid, + transform(tensors, [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }) + }); break; default: throw mk_runtime_error("Invalid tensor role, got {}", role); diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc index 1d0cb7a2e9..ae5b188dfd 100644 --- a/lib/local-execution/src/tensor_reduction.cc +++ b/lib/local-execution/src/tensor_reduction.cc @@ -7,11 +7,4 @@ reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; } -std::vector - lower(std::vector const &tensor_guids) { - return transform(tensor_guids, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); -} - } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index c9e95fe444..88dfa34783 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -164,21 +164,27 @@ TEST_SUITE(FF_TEST_SUITE) { local_slots_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - get_incoming_inputs(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_incoming_input_tensors) == + std::vector correct_incoming_input_tensors = + 
transform(get_incoming_inputs(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_incoming_input_tensors == local_slots_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - get_incoming_weights(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_incoming_weight_tensors) == + std::vector correct_incoming_weight_tensors = + transform(get_incoming_weights(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_incoming_weight_tensors == local_slots_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { - std::vector correct_outgoing_tensors = - get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(lower(correct_outgoing_tensors) == + std::vector correct_output_tensors = + transform(get_outgoing_tensors(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); + CHECK(correct_output_tensors == local_slots_backing.output_tensor_slots.at(layer_guid)); } } From 895c117100a0ac4cdb1cc1dead37f2efbe3786f9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 16 Jan 2025 17:04:17 -0800 Subject: [PATCH 25/91] Add tensor and task lowering scheme --- .../layer_tensor_key.struct.toml | 21 -- .../local-execution/local_args_backing.h | 37 +++ .../local-execution/local_slots_backing.h | 89 ------ .../local-execution/local_tensor_backing.h | 58 ++++ .../local-execution/local_training_backing.h | 25 +- .../include/local-execution/loss_functions.h | 4 +- ....struct.toml => loss_tensor_t.struct.toml} | 2 +- .../local-execution/lowered_tensor_source.h | 21 ++ ...ruct.toml => lowered_tensor_t.struct.toml} | 6 +- .../local-execution/model_training_instance.h | 12 +- .../local-execution/op_task_invocation.h | 6 +- .../op_tensor_slot_spec.struct.toml | 6 +- .../include/local-execution/optimizer.h | 15 +- .../local-execution/optimizer_tensor_source.h | 21 ++ .../optimizer_tensor_t.struct.toml | 13 + .../local-execution/slot_grad_id.struct.toml | 21 ++ .../include/local-execution/task_binding.h | 24 +- .../{tensor_reduction.h => tensor_lowering.h} | 4 +- .../local-execution/tensor_type.enum.toml | 2 +- .../tensor_type_t.variant.toml | 26 ++ lib/local-execution/src/local_args_backing.cc | 62 ++++ .../src/local_cost_estimator.cc | 2 +- .../src/local_slots_backing.cc | 270 ------------------ .../src/local_tensor_backing.cc | 123 ++++++++ .../src/local_training_backing.cc | 109 ++++--- lib/local-execution/src/loss_functions.cc | 14 +- .../src/lowered_tensor_source.cc | 13 + .../src/model_training_instance.cc | 8 +- lib/local-execution/src/op_task_invocation.cc | 12 +- lib/local-execution/src/op_task_signature.cc | 18 +- lib/local-execution/src/optimizer.cc | 28 +- .../src/optimizer_tensor_source.cc | 13 + lib/local-execution/src/task_binding.cc | 45 ++- lib/local-execution/src/tensor_lowering.cc | 10 + lib/local-execution/src/tensor_reduction.cc | 10 - 35 files changed, 624 insertions(+), 526 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/layer_tensor_key.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_args_backing.h delete mode 100644 lib/local-execution/include/local-execution/local_slots_backing.h create mode 100644 lib/local-execution/include/local-execution/local_tensor_backing.h rename 
lib/local-execution/include/local-execution/{reduced_tensor_t.struct.toml => loss_tensor_t.struct.toml} (82%) create mode 100644 lib/local-execution/include/local-execution/lowered_tensor_source.h rename lib/local-execution/include/local-execution/{non_graph_tensor_guid_t.struct.toml => lowered_tensor_t.struct.toml} (62%) create mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml create mode 100644 lib/local-execution/include/local-execution/slot_grad_id.struct.toml rename lib/local-execution/include/local-execution/{tensor_reduction.h => tensor_lowering.h} (67%) create mode 100644 lib/local-execution/include/local-execution/tensor_type_t.variant.toml create mode 100644 lib/local-execution/src/local_args_backing.cc delete mode 100644 lib/local-execution/src/local_slots_backing.cc create mode 100644 lib/local-execution/src/local_tensor_backing.cc create mode 100644 lib/local-execution/src/lowered_tensor_source.cc create mode 100644 lib/local-execution/src/optimizer_tensor_source.cc create mode 100644 lib/local-execution/src/tensor_lowering.cc delete mode 100644 lib/local-execution/src/tensor_reduction.cc
diff --git a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml b/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml deleted file mode 100644 index 33a7a9174f..0000000000 --- a/lib/local-execution/include/local-execution/layer_tensor_key.struct.toml +++ /dev/null @@ -1,21 +0,0 @@ -namespace = "FlexFlow" -name = "LayerTensorKey" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/layer_guid_t.dtg.h", - "local-execution/reduced_tensor_t.dtg.h" -] - -[[fields]] -name = "layer_guid" -type = "::FlexFlow::layer_guid_t" - -[[fields]] -name = "reduced_tensor" -type = "::FlexFlow::reduced_tensor_t"
diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h new file mode 100644 index 0000000000..d497c49738 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H + +#include "pcg/layer_guid_t.dtg.h" +#include "pcg/computation_graph.h" +#include "local-execution/per_device_op_state.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/local_task_argument_accessor.h" + +namespace FlexFlow { + +struct LocalArgsBacking { + LocalArgsBacking(RuntimeArgConfig const &); + +public: + void add_per_device_op_state(layer_guid_t const &, + DeviceSpecificDeviceStates const &); + + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; + + ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; + ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &) const; + +public: + // arguments + std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> + per_device_op_states; + RuntimeArgConfig runtime_arg_config; +}; + +} // namespace FlexFlow + +#endif
diff --git a/lib/local-execution/include/local-execution/local_slots_backing.h b/lib/local-execution/include/local-execution/local_slots_backing.h deleted file mode 100644 index a632f432cf..0000000000 --- a/lib/local-execution/include/local-execution/local_slots_backing.h +++ /dev/null @@ 
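A minimal usage sketch of the LocalArgsBacking declared above; this driver function is illustrative only (not part of the patch) and assumes nothing beyond the header's declarations:

// Sketch: record a layer's init-time device state, then lower a binding's
// argument slots through the new args backing.
void example_lower_args(RuntimeArgConfig const &config,
                        layer_guid_t const &layer,
                        DeviceSpecificDeviceStates const &init_state,
                        TaskBinding const &binding) {
  LocalArgsBacking args_backing{config};
  // Cache the state produced by the layer's init task; later invocations for
  // this layer can then resolve per-device-state arg refs against it.
  args_backing.add_per_device_op_state(layer, init_state);
  // RuntimeArgRefSpecs in the binding are resolved against the stored runtime
  // config; ConcreteArgSpecs pass through unchanged.
  ArgSlotsBacking lowered = args_backing.construct_arg_slots_backing(binding);
}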
-1,89 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_SLOTS_BACKING_H - -#include "kernels/accessor.h" -#include "local-execution/layer_tensor_key.dtg.h" -#include "local-execution/local_task_argument_accessor.h" -#include "local-execution/non_graph_tensor_guid_t.dtg.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -using LayerTensorBackingMap = - std::unordered_map; - -using TensorBackingMap = - std::unordered_map; - -struct LocalSlotsBacking { - LocalSlotsBacking(LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, - RuntimeArgConfig const &); - -public: - void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_optimizer_tensors(layer_guid_t const &weight_layer, - tensor_guid_t const &, - ComputationGraph const &, - Allocator &, - TaskSignature const &); - TensorSlotsBacking construct_tensor_slots_backing(OpTaskBinding const &, - layer_guid_t const &) const; - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &, - std::optional const &) const; - ArgSlotsBacking construct_arg_slots_backing(OpTaskBinding const &, - layer_guid_t const &) const; - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec resolve_runtime_arg_ref_spec(RuntimeArgRefSpec const &) const; - ConcreteArgSpec resolve_op_arg_ref_spec(OpArgRefSpec const &, - layer_guid_t const &) const; - - GenericTensorAccessorW const & - get_tensor_backing(TensorType const &, - reduced_tensor_t const &, - std::optional const &) const; - - bool is_forward_tensor_allocated(LayerTensorKey const &) const; - bool is_non_graph_tensor_allocated(reduced_tensor_t const &) const; - -public: - // tensors - LayerTensorBackingMap tensor_mapping; - LayerTensorBackingMap gradient_tensor_mapping; - LayerTensorBackingMap optimizer_tensor_mapping; - TensorBackingMap non_graph_tensor_mapping; - std::unordered_map> - input_tensor_slots; - std::unordered_map> - weight_tensor_slots; - std::unordered_map> - output_tensor_slots; - std::unordered_map> - weight_optimizer_tensor_guids; - - // arguments - std::unordered_map - per_device_op_states; - RuntimeArgConfig runtime_arg_config; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h new file mode 100644 index 0000000000..68a38253f8 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -0,0 +1,58 @@ + +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H + +#include "kernels/accessor.h" +#include "local-execution/local_task_argument_accessor.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_source.h" +#include 
"local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +using TensorBackingMap = + std::unordered_map; + +struct LocalTensorBacking { + LocalTensorBacking(); + +public: + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_optimizer_tensors(tensor_guid_t const &, + std::vector const &, + Allocator &); + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &) const; + + GenericTensorAccessorW const & + get_tensor_backing(lowered_tensor_t const &) const; + + bool is_tensor_allocated(lowered_tensor_t const &) const; + +public: + // tensors + TensorBackingMap tensor_backings; + + std::unordered_map tensor_lowering_mapping; + std::unordered_map gradient_tensor_lowering_mapping; + std::unordered_map optimizer_tensor_lowering_mapping; + std::unordered_map loss_tensor_lowering_mapping; + + LoweredTensorSource lowered_tensor_source; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 26ebfbe3c4..a915f3e420 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,11 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/local_slots_backing.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/local_args_backing.h" #include "local-execution/task_registry.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "local-execution/optimizer_tensor_source.h" namespace FlexFlow { @@ -15,8 +17,6 @@ using PerLayerElapsedTime = struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &); void register_and_allocate_layer(layer_guid_t const &); void allocate_layer_optimizer_tensors(layer_guid_t const &, @@ -25,17 +25,18 @@ struct LocalTrainingBacking { void execute_init(layer_guid_t const &); std::optional execute_forward(layer_guid_t const &); void compute_loss(LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor); + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); std::optional execute_backward(layer_guid_t const &); void execute_update(layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor - get_task_arg_accessor(TaskInvocation const &, - std::optional const &) const; - TaskArgumentAccessor get_op_task_arg_accessor(OpTaskInvocation const &, - layer_guid_t const &) const; - LocalSlotsBacking local_slots_backing; + get_task_arg_accessor(TaskInvocation const &) const; + + TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; private: DeviceSpecificDeviceStates call_init_task_impl(task_id_t, @@ -46,6 +47,10 
@@ struct LocalTrainingBacking { Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; + + // optimizer + OptimizerTensorSource optimizer_tensor_source; + std::unordered_map> layer_optimizer_tensor_ids; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index 4ce74da766..f56f2b05b1 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -20,13 +20,15 @@ #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation - backward(LossAttrs const &, reduced_tensor_t logit, reduced_tensor_t label); + backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml b/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml similarity index 82% rename from lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml rename to lib/local-execution/include/local-execution/loss_tensor_t.struct.toml index 726249c970..0d0d428a1b 100644 --- a/lib/local-execution/include/local-execution/reduced_tensor_t.struct.toml +++ b/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "reduced_tensor_t" +name = "loss_tensor_t" features = [ "eq", "ord", diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h new file mode 100644 index 0000000000..63cc2cd31e --- /dev/null +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H + +#include "local-execution/lowered_tensor_t.dtg.h" + +namespace FlexFlow { + +struct LoweredTensorSource { +public: + LoweredTensorSource(); + + lowered_tensor_t new_lowered_tensor(); + +private: + static size_t next_available_lowered_tensor_id; +}; + +} // namespace FlexFlow + + +#endif diff --git a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml b/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml similarity index 62% rename from lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml rename to lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml index 4832ecaafa..287e548a5b 100644 --- a/lib/local-execution/include/local-execution/non_graph_tensor_guid_t.struct.toml +++ b/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml @@ -1,13 +1,13 @@ namespace = "FlexFlow" -name = "non_graph_tensor_guid_t" +name = "lowered_tensor_t" features = [ "eq", "ord", "hash", "fmt", - "json", ] + [[fields]] -name = "raw_uid" +name = "raw_index" type = "int" diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 5cc13f0b40..dd6a6f33d7 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ 
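LoweredTensorSource above (and the OptimizerTensorSource introduced alongside it in this commit) follow the same id-source pattern; a self-contained sketch of that pattern, with illustrative names:

// Sketch of the id-source pattern: a static counter hands out fresh,
// never-reused ids through a tiny factory type.
struct example_id_t {
  int raw_index;
};

struct ExampleIdSource {
  example_id_t new_id() {
    // Post-increment: hand out the current id, then bump the shared counter.
    return example_id_t{ExampleIdSource::next_available_id++};
  }

private:
  static int next_available_id;
};

int ExampleIdSource::next_available_id = 0;

Keeping the counter static makes the ids unique across every source instance in the process, so independently constructed sources never hand out colliding ids.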
b/lib/local-execution/include/local-execution/model_training_instance.h @@ -3,6 +3,8 @@ #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { @@ -12,12 +14,10 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, ComputationGraph const &, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &, LossAttrs const &, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, OptimizerAttrs const &); void execute_init(); @@ -28,8 +28,8 @@ struct ModelTrainingInstance { ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; - reduced_tensor_t logit_tensor; - reduced_tensor_t label_tensor; + tensor_guid_t logit_tensor; + loss_tensor_t label_tensor; OptimizerAttrs optimizer_attrs; }; diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/local-execution/include/local-execution/op_task_invocation.h index 6484981ebf..0f351c3a0e 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_invocation.h @@ -10,7 +10,7 @@ #include "local-execution/op_tensor_spec.h" #include "local-execution/profiling.h" #include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_tensor_type_id.dtg.h" +#include "local-execution/slot_grad_id.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/variadic_tensor_ref.h" #include @@ -84,14 +84,14 @@ struct OpTaskBinding { bool operator==(OpTaskBinding const &other) const; bool operator!=(OpTaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void bind_from_forward(OpTaskBinding const &fwd); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml index 54638a7eb6..590dbe6362 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml @@ -11,7 +11,7 @@ includes = [ "local-execution/slot_id_t.dtg.h", "local-execution/slot_type.dtg.h", "local-execution/tensor_role.dtg.h", - "local-execution/tensor_type.dtg.h", + "local-execution/is_grad.dtg.h", "local-execution/op_slot_options.dtg.h", ] @@ -28,8 +28,8 @@ name = "tensor_role" type = "::FlexFlow::TensorRole" [[fields]] -name = "tensor_type" -type = "::FlexFlow::TensorType" +name = "is_grad" +type = "::FlexFlow::IsGrad" [[fields]] name = "slot_option" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 2eb480a0c1..f0dd610a1f 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,7 +1,6 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include 
"local-execution/non_graph_tensor_guid_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" @@ -14,21 +13,21 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - reduced_tensor_t const &weight, - std::vector const &grad_buffer_tensors); + tensor_guid_t const &weight, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, - reduced_tensor_t const &weight, - reduced_tensor_t const &sgd_v); + tensor_guid_t const &weight, + optimizer_tensor_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, - reduced_tensor_t const &weight, - reduced_tensor_t const &adam_v, - reduced_tensor_t const &adam_m); + tensor_guid_t const &weight, + optimizer_tensor_t const &adam_v, + optimizer_tensor_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h new file mode 100644 index 0000000000..fc5015b299 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H + +#include "local-execution/optimizer_tensor_t.dtg.h" + +namespace FlexFlow { + +struct OptimizerTensorSource { +public: + OptimizerTensorSource(); + + optimizer_tensor_t new_optimizer_tensor(); + +private: + static size_t next_available_optimizer_tensor_id; +}; + +} // namespace FlexFlow + + +#endif diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml b/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml new file mode 100644 index 0000000000..5d3e05f673 --- /dev/null +++ b/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "optimizer_tensor_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + + +[[fields]] +name = "raw_index" +type = "int" diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/local-execution/include/local-execution/slot_grad_id.struct.toml new file mode 100644 index 0000000000..256091d272 --- /dev/null +++ b/lib/local-execution/include/local-execution/slot_grad_id.struct.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "SlotGradId" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "local-execution/is_grad.dtg.h", + "local-execution/slot_id_t.dtg.h", +] + +[[fields]] +name = "slot_id" +type = "::FlexFlow::slot_id_t" + +[[fields]] +name = "is_grad" +type = "::FlexFlow::IsGrad" diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index e211592ea6..33636616b3 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -1,20 +1,32 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "local-execution/reduced_tensor_t.dtg.h" +#include 
"local-execution/lowered_tensor_t.dtg.h" #include "local-execution/slot_id_t.dtg.h" #include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" +#include "local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/tensor_type_t.dtg.h" namespace FlexFlow { struct TaskBinding { TaskBinding() = default; - void bind(int, TensorType const &, reduced_tensor_t const &); - void bind(slot_id_t, TensorType const &, reduced_tensor_t const &); + void bind(int, tensor_guid_t const &); + void bind(slot_id_t, tensor_guid_t const &); + + void bind_grad(int, tensor_guid_t const &); + void bind_grad(slot_id_t, tensor_guid_t const &); + + void bind(int, optimizer_tensor_t const &); + void bind(slot_id_t, optimizer_tensor_t const &); + + void bind(int, loss_tensor_t const &); + void bind(slot_id_t, loss_tensor_t const &); template void bind_arg(int name, T const &t) { @@ -39,16 +51,16 @@ struct TaskBinding { bool operator==(TaskBinding const &other) const; bool operator!=(TaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; + void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); private: - std::unordered_map tensor_bindings; + std::unordered_map tensor_bindings; std::unordered_map arg_bindings; private: - void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); std::tuple tie() const; }; diff --git a/lib/local-execution/include/local-execution/tensor_reduction.h b/lib/local-execution/include/local-execution/tensor_lowering.h similarity index 67% rename from lib/local-execution/include/local-execution/tensor_reduction.h rename to lib/local-execution/include/local-execution/tensor_lowering.h index 2cb0b12ff0..5f3870c1d2 100644 --- a/lib/local-execution/include/local-execution/tensor_reduction.h +++ b/lib/local-execution/include/local-execution/tensor_lowering.h @@ -1,12 +1,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H -#include "local-execution/reduced_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { -reduced_tensor_t lower(tensor_guid_t const &); +lowered_tensor_t lower(tensor_guid_t const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/local-execution/include/local-execution/tensor_type.enum.toml index 31ce5ba83a..b1ae8fa667 100644 --- a/lib/local-execution/include/local-execution/tensor_type.enum.toml +++ b/lib/local-execution/include/local-execution/tensor_type.enum.toml @@ -8,7 +8,7 @@ features = [ ] [[values]] -name = "NON_GRAPH" +name = "LOSS" [[values]] name = "FORWARD" diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml new file mode 100644 index 0000000000..d4e525c348 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "TensorTypeVariant" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "pcg/tensor_guid_t.dtg.h", + "local-execution/optimizer_tensor_t.dtg.h", + "local-execution/loss_tensor_t.dtg.h" +] + +[[values]] +type = "::FlexFlow::tensor_guid_t" +key = 
"tensor_guid" + +[[values]] +type = "::FlexFlow::optimizer_tensor_t" +key = "optimizer_tensor" + +[[values]] +type = "::FlexFlow::loss_tensor_t" +key = "loss_tensor" diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc new file mode 100644 index 0000000000..0c3cfe70e8 --- /dev/null +++ b/lib/local-execution/src/local_args_backing.cc @@ -0,0 +1,62 @@ +#include "local-execution/local_args_backing.h" +#include "utils/containers/map_values.h" +#include "utils/containers/contains_key.h" +#include "utils/overload.h" +#include "op-attrs/parallel_tensor_shape.h" + +namespace FlexFlow { + + +void LocalArgsBacking::add_per_device_op_state( + layer_guid_t const &op_guid, + DeviceSpecificDeviceStates const &device_state) { + this->per_device_op_states.insert({op_guid, device_state}); +} + +ArgSlotsBacking LocalArgsBacking::construct_arg_slots_backing( + TaskBinding const &binding) const { + return map_values( + binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return this->lower_to_concrete_arg_spec(s); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); + ; +} + +ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( + OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const & cg, layer_guid_t const &op_guid) const { + if (op_arg_ref_spec.holds()) { + assert(contains_key(this->per_device_op_states, op_guid)); + DeviceSpecificDeviceStates device_specific = + per_device_op_states.at(op_guid); + PerDeviceOpState device_state = + get_device_state_from_device_specific(device_specific, 0); + return ConcreteArgSpec::create(device_state); + } else if (op_arg_ref_spec.holds()) { + ParallelTensorShapeRefType index_op_arg_ref = + op_arg_ref_spec.get_ref_type().get(); + tensor_guid_t input_tensor = get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); + TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); + ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + return ConcreteArgSpec::create(shape); + } else { + throw mk_runtime_error("Unhandled op arg ref type"); + } +} + +ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( + RuntimeArgRefSpec const &runtime_arg_ref_spec) const { + if (runtime_arg_ref_spec.holds>()) { + return ConcreteArgSpec::create( + *(this->runtime_arg_config.ff_handle.get(0))); + } else if (runtime_arg_ref_spec.holds()) { + return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); + } else { + throw mk_runtime_error("Unhandled runtime arg ref type"); + } +} + +} diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 02265281b0..404064b7ce 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,5 +1,5 @@ #include "local-execution/local_cost_estimator.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/tracked_allocator.h" diff --git a/lib/local-execution/src/local_slots_backing.cc b/lib/local-execution/src/local_slots_backing.cc deleted file mode 100644 index 8a277adc78..0000000000 --- a/lib/local-execution/src/local_slots_backing.cc +++ /dev/null @@ -1,270 +0,0 @@ -#include "local-execution/local_slots_backing.h" -#include "local-execution/tensor_reduction.h" -#include "op-attrs/parallel_tensor_shape.h" 
-#include "pcg/computation_graph.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -LocalSlotsBacking::LocalSlotsBacking( - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, - RuntimeArgConfig const &runtime_arg_config) - : tensor_mapping(allocated_forward_tensors), - non_graph_tensor_mapping(allocated_non_graph_tensors), - runtime_arg_config(runtime_arg_config){}; - -void LocalSlotsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - this->per_device_op_states.insert({op_guid, device_state}); -} - -void LocalSlotsBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); -} - -void LocalSlotsBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - this->input_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - this->weight_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - this->output_tensor_slots.insert({layer_guid, - transform(tensors, [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }) - }); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); - } - - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - reduced_tensor_t reduced_tensor = lower(tensor); - LayerTensorKey layer_tensor_key = - LayerTensorKey{layer_guid, reduced_tensor}; - // tensor allocation - if (!is_forward_tensor_allocated(layer_tensor_key)) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_mapping.insert({layer_tensor_key, tensor_backing}); - } - - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES) { - GenericTensorAccessorW gradient_tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->gradient_tensor_mapping.insert( - {layer_tensor_key, gradient_tensor_backing}); - } - } -} - -void LocalSlotsBacking::allocate_optimizer_tensors( - layer_guid_t const &weight_layer, - tensor_guid_t const &weight, - ComputationGraph const &cg, - Allocator &allocator, - TaskSignature const &sig) { - GenericTensorAccessorW weight_backing = this->get_tensor_backing( - TensorType::FORWARD, lower(weight), weight_layer); - int num_grad_buffer_tensors = - sig.tensor_guid_slots.size() - 2; // ignore 2 (weight and weight_grad) - std::vector optimizer_buffer_tensors; - for (int i = 0; i < num_grad_buffer_tensors; ++i) { - 
reduced_tensor_t buffer_tensor = reduced_tensor_t{i}; - GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( - get_tensor_shape(weight_backing.shape, weight_backing.data_type)); - this->optimizer_tensor_mapping.insert( - {LayerTensorKey{weight_layer, buffer_tensor}, buffer_backing}); - optimizer_buffer_tensors.push_back(buffer_tensor); - } - this->weight_optimizer_tensor_guids.insert( - {weight_layer, optimizer_buffer_tensors}); -} - -bool LocalSlotsBacking::is_forward_tensor_allocated( - LayerTensorKey const &layer_tensor_id) const { - return contains_key(this->tensor_mapping, layer_tensor_id); -} - -bool LocalSlotsBacking::is_non_graph_tensor_allocated( - reduced_tensor_t const &tensor_id) const { - return contains_key(this->non_graph_tensor_mapping, tensor_id); -} - -GenericTensorAccessorW const &LocalSlotsBacking::get_tensor_backing( - TensorType const &tensor_type, - reduced_tensor_t const &tensor_id, - std::optional const &layer_guid) const { - switch (tensor_type) { - case TensorType::FORWARD: - return this->tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - case TensorType::NON_GRAPH: - return this->non_graph_tensor_mapping.at(tensor_id); - case TensorType::GRADIENT: - return this->gradient_tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - case TensorType::OPTIMIZER: - return this->optimizer_tensor_mapping.at( - LayerTensorKey{layer_guid.value(), tensor_id}); - default: - throw mk_runtime_error( - fmt::format("Invalid tensor type {}", tensor_type)); - } -} - -TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_grad_id = tensor_binding.first; - OpTensorSpec tensor_spec = tensor_binding.second; - std::vector tensor_guids; - int weight_adjusted_idx = 0; - switch (tensor_spec.role) { - case TensorRole::WEIGHT: - assert(contains_key(this->weight_tensor_slots, op_guid)); - tensor_guids = this->weight_tensor_slots.at(op_guid); - break; - case TensorRole::INPUT: - assert(contains_key(this->input_tensor_slots, op_guid)); - tensor_guids = this->input_tensor_slots.at(op_guid); - break; - case TensorRole::OUTPUT: - assert(contains_key(this->output_tensor_slots, op_guid)); - tensor_guids = this->output_tensor_slots.at(op_guid); - break; - default: - throw mk_runtime_error( - fmt::format("Invalid TensorRole {}", tensor_spec.role)); - } - - mapping.insert({slot_grad_id, - this->get_tensor_backing(slot_grad_id.tensor_type, - tensor_guids.at(tensor_spec.idx), - op_guid)}); - } - return mapping; -} - -TensorSlotsBacking LocalSlotsBacking::construct_tensor_slots_backing( - TaskBinding const &binding, - std::optional const &layer_guid) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - reduced_tensor_t tensor_id = tensor_binding.second; - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - GenericTensorAccessorW accessor = this->get_tensor_backing( - slot_tensor_type_id.tensor_type, tensor_id, layer_guid); - mapping.insert({slot_tensor_type_id, accessor}); - } - - return mapping; -} - -ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( - OpTaskBinding const &binding, layer_guid_t const &op_guid) const { - return map_values( - binding.get_arg_bindings(), [&](OpArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](OpArgRefSpec 
const &s) { - return this->resolve_op_arg_ref_spec(s, op_guid); - }, - [&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); -} - -ArgSlotsBacking LocalSlotsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return this->resolve_runtime_arg_ref_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -ConcreteArgSpec LocalSlotsBacking::resolve_op_arg_ref_spec( - OpArgRefSpec const &op_arg_ref_spec, layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - - assert(contains_key(this->input_tensor_slots, op_guid)); - std::vector input_tensor_guids = - this->input_tensor_slots.at(op_guid); - - assert(input_tensor_guids.size() > index_op_arg_ref.idx); - GenericTensorAccessorW tensor_backing = - this->get_tensor_backing(TensorType::FORWARD, - input_tensor_guids.at(index_op_arg_ref.idx), - op_guid); - ParallelTensorShape shape = lift_to_parallel( - get_tensor_shape(tensor_backing.shape, tensor_backing.data_type)); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } -} - -ConcreteArgSpec LocalSlotsBacking::resolve_runtime_arg_ref_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc new file mode 100644 index 0000000000..9da74c27b9 --- /dev/null +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -0,0 +1,123 @@ +#include "local-execution/local_tensor_backing.h" +#include "local-execution/tensor_lowering.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "utils/containers/contains_key.h" +#include "utils/overload.h" +#include "local-execution/slot_grad_id.dtg.h" + +namespace FlexFlow { + +LocalTensorBacking::LocalTensorBacking() {}; + +void LocalTensorBacking::allocate_layer_tensors( + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + Allocator &allocator) { + this->allocate_tensors_by_role( + TensorRole::INPUT, layer_guid, computation_graph, allocator); + this->allocate_tensors_by_role( + TensorRole::WEIGHT, layer_guid, computation_graph, allocator); + this->allocate_tensors_by_role( + TensorRole::OUTPUT, layer_guid, computation_graph, allocator); +} + +void LocalTensorBacking::allocate_tensors_by_role( + TensorRole const &role, + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + Allocator &allocator) { + std::vector tensors; + switch (role) { + case 
TensorRole::INPUT: + tensors = get_incoming_inputs(computation_graph, layer_guid); + break; + case TensorRole::WEIGHT: + tensors = get_incoming_weights(computation_graph, layer_guid); + break; + case TensorRole::OUTPUT: + tensors = get_outgoing_tensors(computation_graph, layer_guid); + break; + default: + throw mk_runtime_error("Invalid tensor role, got {}", role); + } + + for (tensor_guid_t const &tensor : tensors) { + TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + // tensor allocation + if (!contains_key(this->tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_attrs.shape); + this->tensor_backings.insert({reduced_tensor, tensor_backing}); + } + + // gradient tensor allocation + if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); + GenericTensorAccessorW gradient_tensor_backing = + allocator.allocate_tensor(tensor_attrs.shape); + this->tensor_backings.insert( + {reduced_tensor, gradient_tensor_backing}); + } + } +} + +void LocalTensorBacking::allocate_optimizer_tensors( + tensor_guid_t const &weight, + std::vector<optimizer_tensor_t> const& optimizer_tensors, + Allocator &allocator) { + GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); + for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) { + // optimizer tensor allocation + if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) { + lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor(); + this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor}); + GenericTensorAccessorW buffer_backing = allocator.allocate_tensor( + get_tensor_shape(weight_backing.shape, weight_backing.data_type)); + this->tensor_backings.insert({buffer_tensor, buffer_backing}); + } + } +} + +bool LocalTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const { + return contains_key(tensor_backings, tensor_id); +} + +GenericTensorAccessorW const &LocalTensorBacking::get_tensor_backing( + lowered_tensor_t const &tensor_id) const { + return this->tensor_backings.at(tensor_id); +} + +TensorSlotsBacking LocalTensorBacking::construct_tensor_slots_backing( + TaskBinding const &binding) const { + TensorSlotsBacking mapping; + + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; + + lowered_tensor_t tensor_id = [&] { + TensorTypeVariant tensor_type = tensor_binding.second; + if (tensor_type.has<tensor_guid_t>() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) { + return this->tensor_lowering_mapping.at(tensor_type.get<tensor_guid_t>()); + } else if (tensor_type.has<tensor_guid_t>() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { + return this->gradient_tensor_lowering_mapping.at(tensor_type.get<tensor_guid_t>()); + } else if (tensor_type.has<optimizer_tensor_t>()) { + return this->optimizer_tensor_lowering_mapping.at(tensor_type.get<optimizer_tensor_t>()); + } else if (tensor_type.has<loss_tensor_t>()) { + return this->loss_tensor_lowering_mapping.at(tensor_type.get<loss_tensor_t>()); + } else { + throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); + } + }(); + + 
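// Note on the immediately-invoked lambda above: it dispatches on which
// alternative the bound TensorTypeVariant holds and, for graph tensors, on
// whether the slot asked for the FORWARD or GRADIENT copy, selecting the
// matching guid -> lowered_tensor_t map; the .at() lookups therefore throw
// if the tensor was never allocated through this backing.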
GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + mapping.insert({slot_tensor_type_id, accessor}); + } + + return mapping; +} + +} // namespace FlexFlow
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index f02a8c7824..9b933dee9c 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -3,12 +3,13 @@ #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" +#include "utils/containers/values.h" #include "utils/exception.h" namespace FlexFlow { @@ -16,20 +17,16 @@ LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config) : allocator(allocator), computation_graph(computation_graph), - local_slots_backing(allocated_forward_tensors, - allocated_non_graph_tensors, - runtime_arg_config), - task_registry(empty_task_registry()) {} + local_args_backing(runtime_arg_config), + task_registry(empty_task_registry()) {}; void LocalTrainingBacking::register_and_allocate_layer( layer_guid_t const &node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; - this->local_slots_backing.allocate_layer_tensors( + this->local_tensor_backing.allocate_layer_tensors( node, this->computation_graph, this->allocator); register_tasks_for_layer(this->task_registry, node, attrs); } @@ -42,8 +39,14 @@ void LocalTrainingBacking::allocate_layer_optimizer_tensors( TaskSignature sig = get_update_signature(optimizer_attrs); tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - this->local_slots_backing.allocate_optimizer_tensors( - node, weight_tensor, this->computation_graph, this->allocator, sig); + + std::vector<optimizer_tensor_t> optimizer_tensors; + for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { + optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); + } + this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); + this->local_tensor_backing.allocate_optimizer_tensors( + weight_tensor, optimizer_tensors, this->allocator); } } @@ -73,12 +76,12 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = init(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); DeviceSpecificDeviceStates device_state = this->call_init_task_impl(invocation.task_id, accessor); - this->local_slots_backing.add_per_device_op_state(operator_node, + this->local_args_backing.add_per_device_op_state(operator_node, device_state); } } @@ -90,9 +93,9 @@ std::optional<float> ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = forward(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); } else { return std::nullopt; @@ -100,15 +103,14 @@ } void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor) { - assert(this->local_slots_backing.is_non_graph_tensor_allocated(label_tensor)); + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation, std::nullopt); + this->get_task_arg_accessor(loss_invocation); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor); } @@ -120,9 +122,9 @@ std::optional<float> ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - OpTaskInvocation invocation = backward(attrs); + TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); TaskArgumentAccessor accessor = - this->get_op_task_arg_accessor(invocation, operator_node); + this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); } else { return std::nullopt; @@ -134,10 +136,8 @@ void LocalTrainingBacking::execute_update( LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); if (layer_attrs.attrs.has<WeightAttrs>()) { // get tensors - reduced_tensor_t weight_tensor = - lower(get_only(get_outgoing_tensors(this->computation_graph, node))); - std::vector<reduced_tensor_t> optimizer_buffer_tensors = - this->local_slots_backing.weight_optimizer_tensor_guids.at(node); + tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); + std::vector<optimizer_tensor_t> optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node); // get invocation TaskInvocation invocation = get_update_invocation( @@ -148,35 +148,62 @@ void LocalTrainingBacking::execute_update( // execute update TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation, node); + this->get_task_arg_accessor(invocation); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor); } } TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation, - std::optional<layer_guid_t> const &layer_guid) const { + TaskInvocation const &invocation) const { TensorSlotsBacking tensor_slots_backing = - this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding, layer_guid); + this->local_tensor_backing.construct_tensor_slots_backing( + invocation.binding); ArgSlotsBacking arg_slots_backing = - this->local_slots_backing.construct_arg_slots_backing(invocation.binding); + this->local_args_backing.construct_arg_slots_backing(invocation.binding); return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>( this->allocator, tensor_slots_backing, arg_slots_backing); } -TaskArgumentAccessor LocalTrainingBacking::get_op_task_arg_accessor( - OpTaskInvocation const &invocation, layer_guid_t const &op_guid) const { - TensorSlotsBacking tensor_slots_backing = - this->local_slots_backing.construct_tensor_slots_backing( - invocation.binding, op_guid); - ArgSlotsBacking arg_slots_backing = - this->local_slots_backing.construct_arg_slots_backing(invocation.binding, - op_guid); +TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const { + TaskBinding binding; + // tensors + for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) { + tensor_guid_t tensor_to_bind = [&] { + switch (tensor_binding.second.role) { + case TensorRole::INPUT: + return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + case TensorRole::OUTPUT: + return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + case TensorRole::WEIGHT: + return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx); + default: + throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role)); + } + }(); + + if (tensor_binding.first.is_grad == IsGrad::NO) { + binding.bind(tensor_binding.first.slot_id, tensor_to_bind); + } else if (tensor_binding.first.is_grad == IsGrad::YES) { + binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); + } else { + throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); + } + } - return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>( - this->allocator, tensor_slots_backing, arg_slots_backing); + // args + for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) { + if (arg_binding.second.has<OpArgRefSpec>()) { + ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get<OpArgRefSpec>(), this->computation_graph, layer_guid); + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); + } else if (arg_binding.second.has<RuntimeArgRefSpec>()) { + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()}); + } else { + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()}); + } + } + + return TaskInvocation{op_task_invocation.task_id, binding}; } } // namespace FlexFlow
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index e54841acb5..bfb3c0a32b 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -24,8 +24,8 @@ enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); - add_slot(sig, LOGIT, TensorType::NON_GRAPH); - add_slot(sig, LABEL, TensorType::NON_GRAPH); + add_slot(sig, LOGIT, TensorType::FORWARD); + add_slot(sig, LABEL, TensorType::LOSS); add_slot(sig, LOGIT, TensorType::GRADIENT); add_arg_slot<LossAttrs>(sig, ATTRS); @@ -34,12 +34,12 @@ } TaskInvocation backward(LossAttrs const &attrs, - reduced_tensor_t logit, - reduced_tensor_t label) { + tensor_guid_t logit, + loss_tensor_t label) { TaskBinding b; - b.bind(LOGIT, TensorType::NON_GRAPH, logit); - b.bind(LABEL, TensorType::NON_GRAPH, label); - b.bind(LOGIT, TensorType::GRADIENT, logit); + b.bind(LOGIT, logit); + b.bind(LABEL, label); + b.bind_grad(LOGIT, logit); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings());
diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc new file mode 100644 index 0000000000..05960ff5e2 --- /dev/null +++ 
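To make the tensor half of the lowering above concrete: an op task that bound slot INPUT at input index 0, both forward and grad, ends up with a TaskBinding equivalent to this sketch (the slot enum is illustrative only):

// Sketch: the lowered equivalent of binding {INPUT, IsGrad::NO} and
// {INPUT, IsGrad::YES} at input idx 0, where t is the layer's first input.
enum ExampleSlots { INPUT };

TaskBinding lowered_equivalent(tensor_guid_t const &t) {
  TaskBinding b;
  b.bind(INPUT, t);      // from {INPUT, IsGrad::NO}  -> {INPUT, TensorType::FORWARD}
  b.bind_grad(INPUT, t); // from {INPUT, IsGrad::YES} -> {INPUT, TensorType::GRADIENT}
  return b;
}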
b/lib/local-execution/src/lowered_tensor_source.cc @@ -0,0 +1,13 @@ +#include "local-execution/lowered_tensor_source.h" + +namespace FlexFlow { + +size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; + +LoweredTensorSource::LoweredTensorSource() {} + +lowered_tensor_t LoweredTensorSource::new_lowered_tensor() { + return lowered_tensor_t{LoweredTensorSource::next_available_lowered_tensor_id++}; +} + +} diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 4815de5e85..f57c5db73a 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -8,18 +8,14 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( Allocator const &allocator, ComputationGraph const &computation_graph, - LayerTensorBackingMap const &allocated_forward_tensors, - TensorBackingMap const &allocated_non_graph_tensors, RuntimeArgConfig const &runtime_arg_config, LossAttrs const &loss_attrs, - reduced_tensor_t const &logit_tensor, - reduced_tensor_t const &label_tensor, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, OptimizerAttrs const &optimizer_attrs) : computation_graph(computation_graph), training_backing(allocator, computation_graph, - allocated_forward_tensors, - allocated_non_graph_tensors, runtime_arg_config), loss_attrs(loss_attrs), logit_tensor(logit_tensor), label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index 81bf185911..b6771e6eb8 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -21,7 +21,7 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert( - {SlotTensorTypeId{slot, TensorType::FORWARD}, tensor_spec}); + {SlotGradId{slot, IsGrad::NO}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -30,7 +30,7 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { this->tensor_bindings.insert( - {SlotTensorTypeId{slot, TensorType::GRADIENT}, tensor_spec}); + {SlotGradId{slot, IsGrad::YES}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -46,13 +46,13 @@ bool OpTaskBinding::operator!=(OpTaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> OpTaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & OpTaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } @@ -91,8 +91,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotTensorTypeId tensor_key = SlotTensorTypeId{ - op_tensor_slot_spec.name, op_tensor_slot_spec.tensor_type}; + SlotGradId tensor_key = SlotGradId{ + op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff 
--git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 5c8b19265a..69b5463a0d 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -16,7 +16,7 @@ void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -31,7 +31,7 @@ void OpTaskSignature::add_optional_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -46,7 +46,7 @@ void OpTaskSignature::add_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -62,7 +62,7 @@ void OpTaskSignature::add_optional_untrainable_input_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::INPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL_UNTRAINABLE}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -76,7 +76,7 @@ void OpTaskSignature::add_output_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -92,7 +92,7 @@ void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::OUTPUT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -106,7 +106,7 @@ void OpTaskSignature::add_weight_slot(slot_id_t name, SlotType slot_type) { OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -121,7 +121,7 @@ void OpTaskSignature::add_optional_weight_slot(slot_id_t name, OpTensorSlotSpec{name, slot_type, TensorRole::WEIGHT, - TensorType::FORWARD, + IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -146,7 +146,7 @@ OpTaskSignature infer_bwd_signature(OpTaskSignature const &fwd) { OpTensorSlotSpec{op_tensor_slot_spec.name, op_tensor_slot_spec.slot_type, op_tensor_slot_spec.tensor_role, - TensorType::GRADIENT, + IsGrad::YES, op_tensor_slot_spec.slot_option}; bwd.op_tensor_slots.insert(grad_spec); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 5c0d6c54f2..94584dfc95 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -22,14 +22,14 @@ TaskSignature get_sgd_update_signature() { } TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, - reduced_tensor_t const &weight, - reduced_tensor_t const &sgd_v) { + tensor_guid_t const &weight, + optimizer_tensor_t const &sgd_v) { TaskBinding b; - b.bind(WEIGHT, TensorType::FORWARD, weight); - b.bind(WEIGHT, TensorType::GRADIENT, weight); + b.bind(WEIGHT, weight); + b.bind_grad(WEIGHT, weight); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, TensorType::OPTIMIZER, sgd_v); + b.bind(SGD_V, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -113,14 +113,14 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs 
const &attrs,
-                           reduced_tensor_t const &weight,
-                           reduced_tensor_t const &adam_v,
-                           reduced_tensor_t const &adam_m) {
+                           tensor_guid_t const &weight,
+                           optimizer_tensor_t const &adam_v,
+                           optimizer_tensor_t const &adam_m) {
   TaskBinding b;
-  b.bind(WEIGHT, TensorType::FORWARD, weight);
-  b.bind(WEIGHT, TensorType::GRADIENT, weight);
-  b.bind(ADAM_M, TensorType::OPTIMIZER, adam_m);
-  b.bind(ADAM_V, TensorType::OPTIMIZER, adam_v);
+  b.bind(WEIGHT, weight);
+  b.bind_grad(WEIGHT, weight);
+  b.bind(ADAM_M, adam_m);
+  b.bind(ADAM_V, adam_v);
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
 
@@ -194,8 +194,8 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) {
 
 TaskInvocation get_update_invocation(
     OptimizerAttrs const &attrs,
-    reduced_tensor_t const &weight,
-    std::vector<reduced_tensor_t> const &grad_buffer_tensors) {
+    tensor_guid_t const &weight,
+    std::vector<optimizer_tensor_t> const &grad_buffer_tensors) {
   return attrs.visit(overload{
       [&](SGDOptimizerAttrs const &s) {
         return sgd_update(s, weight, grad_buffer_tensors.at(0));
diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc
new file mode 100644
index 0000000000..8adb8ec07b
--- /dev/null
+++ b/lib/local-execution/src/optimizer_tensor_source.cc
@@ -0,0 +1,13 @@
+#include "local-execution/optimizer_tensor_source.h"
+
+namespace FlexFlow {
+
+size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0;
+
+OptimizerTensorSource::OptimizerTensorSource() {}
+
+optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() {
+  return optimizer_tensor_t{
+      OptimizerTensorSource::next_available_optimizer_tensor_id++};
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc
index 2b1256df90..6fc8449f0b 100644
--- a/lib/local-execution/src/task_binding.cc
+++ b/lib/local-execution/src/task_binding.cc
@@ -2,19 +2,48 @@
 #include "utils/containers/contains_key.h"
 #include "utils/fmt/unordered_map.h"
 #include "utils/hash/unordered_map.h"
+#include "pcg/tensor_guid_t.dtg.h"
 
 namespace FlexFlow {
 
 void TaskBinding::bind(int name,
-                       TensorType const &tensor_type,
-                       reduced_tensor_t const &binding) {
-  this->bind(slot_id_t{name}, tensor_type, binding);
+                       tensor_guid_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
 }
 
 void TaskBinding::bind(slot_id_t name,
-                       TensorType const &tensor_type,
-                       reduced_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, tensor_type}, binding});
+                       tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::FORWARD}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind_grad(int name,
+                            tensor_guid_t const &binding) {
+  this->bind_grad(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind_grad(slot_id_t name,
+                            tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind(int name,
+                       optimizer_tensor_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind(slot_id_t name,
+                       optimizer_tensor_t const &binding) {
+  this->tensor_bindings.insert(
+      {SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}});
+}
+
+void TaskBinding::bind(int name,
+                       loss_tensor_t const &binding) {
+  this->bind(slot_id_t{name}, binding);
+}
+
+void TaskBinding::bind(slot_id_t name,
+                       loss_tensor_t const &binding) {
+  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::LOSS},
TensorTypeVariant{binding}}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -30,13 +59,13 @@ bool TaskBinding::operator!=(TaskBinding const &other) const { return this->tie() != other.tie(); } -std::tuple const &, +std::tuple const &, std::unordered_map const &> TaskBinding::tie() const { return std::tie(this->tensor_bindings, this->arg_bindings); } -std::unordered_map const & +std::unordered_map const & TaskBinding::get_tensor_bindings() const { return this->tensor_bindings; } diff --git a/lib/local-execution/src/tensor_lowering.cc b/lib/local-execution/src/tensor_lowering.cc new file mode 100644 index 0000000000..63be366d94 --- /dev/null +++ b/lib/local-execution/src/tensor_lowering.cc @@ -0,0 +1,10 @@ +#include "local-execution/tensor_lowering.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +lowered_tensor_t lower(tensor_guid_t const &tensor_guid) { + return lowered_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/tensor_reduction.cc b/lib/local-execution/src/tensor_reduction.cc deleted file mode 100644 index ae5b188dfd..0000000000 --- a/lib/local-execution/src/tensor_reduction.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include "local-execution/tensor_reduction.h" -#include "utils/containers/transform.h" - -namespace FlexFlow { - -reduced_tensor_t lower(tensor_guid_t const &tensor_guid) { - return reduced_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; -} - -} // namespace FlexFlow From 66d61eb11c1f6566be7b005261d0a7b29f0fc4dc Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 21 Jan 2025 02:49:27 -0800 Subject: [PATCH 26/91] feat: add realm-backend subdir --- .proj.toml | 1 + lib/realm-backend/CMakeLists.txt | 20 ++++++++++++++++++++ lib/realm-backend/test/CMakeLists.txt | 14 ++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 lib/realm-backend/CMakeLists.txt create mode 100644 lib/realm-backend/test/CMakeLists.txt diff --git a/.proj.toml b/.proj.toml index 10307a6efa..c895fcecc4 100644 --- a/.proj.toml +++ b/.proj.toml @@ -12,6 +12,7 @@ build_targets = [ "compiler", "substitution-generator", "local-execution", + "realm-backend", "models", "export-model-arch", "substitution-to-dot", diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt new file mode 100644 index 0000000000..623816567e --- /dev/null +++ b/lib/realm-backend/CMakeLists.txt @@ -0,0 +1,20 @@ +ff_add_library( + NAME + realm-backend + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + local-execution + pcg + spdlog + legion +) + +add_subdirectory(test) diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt new file mode 100644 index 0000000000..965f2e04b2 --- /dev/null +++ b/lib/realm-backend/test/CMakeLists.txt @@ -0,0 +1,14 @@ +ff_add_test_executable( + NAME + realm-backend-tests + SRC_PATTERNS + src/*.cc + PRIVATE_INCLUDE + src/ + DEPS + doctest + utils-test-common + realm-backend + kernels + op-attrs +) From 411017d25dd01e1222d8b9bbff5dd8a441ec745e Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 21 Jan 2025 17:45:26 -0800 Subject: [PATCH 27/91] Build local exec --- .../local-execution/task_argument_accessor.h | 40 +++++++++---------- .../src/local_cost_estimator.cc | 2 - .../src/local_training_backing.cc | 6 +-- 3 files changed, 23 insertions(+), 25 deletions(-) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h 
b/lib/local-execution/include/local-execution/task_argument_accessor.h index 29d5fb8fbe..8b8516045d 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -59,16 +59,16 @@ struct TaskArgumentAccessor { this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - template - privilege_mode_to_accessor get_non_graph_tensor(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); - } + // template + // privilege_mode_to_accessor get_non_graph_tensor(int slot) const { + // return this->get_tensor_grad(slot_id_t{slot}); + // } - template - privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); - } + // template + // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { + // return std::get>( + // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); + // } // variadic tensors template @@ -110,18 +110,18 @@ struct TaskArgumentAccessor { this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - template - std::vector> - get_variadic_non_graph_tensor(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); - } + // template + // std::vector> + // get_variadic_non_graph_tensor(int slot) const { + // return this->get_variadic_tensor_grad(slot_id_t{slot}); + // } - template - std::vector> - get_variadic_non_graph_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); - } + // template + // std::vector> + // get_variadic_non_graph_tensor(slot_id_t slot) const { + // return std::get>>( + // this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); + // } Allocator get_allocator() const { return this->ptr->get_allocator(); diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 404064b7ce..b416378e66 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -66,8 +66,6 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing(allocator, cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 9b933dee9c..22dc3b8397 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -76,7 +76,7 @@ void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(init(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); DeviceSpecificDeviceStates device_state = @@ -93,7 +93,7 @@ std::optional ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); TaskArgumentAccessor accessor = 
this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); @@ -122,7 +122,7 @@ std::optional ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs)); + TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); return this->call_task_impl(invocation.task_id, accessor); From bcd1408de85562e3bc2f9aea1427c38f5c4eeffd Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 22 Jan 2025 01:25:52 -0800 Subject: [PATCH 28/91] chore: duplicate some files from local-execution --- .proj.toml | 1 + lib/CMakeLists.txt | 1 + lib/realm-backend/CMakeLists.txt | 2 +- .../realm-backend/realm_args_backing.h | 37 ++++ .../realm_task_argument_accessor.h | 55 +++++ .../realm-backend/realm_tensor_backing.h | 58 +++++ .../realm-backend/realm_training_backing.h | 58 +++++ .../src/realm_training_backing.cc | 209 ++++++++++++++++++ 8 files changed, 420 insertions(+), 1 deletion(-) create mode 100644 lib/realm-backend/include/realm-backend/realm_args_backing.h create mode 100644 lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h create mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.h create mode 100644 lib/realm-backend/include/realm-backend/realm_training_backing.h create mode 100644 lib/realm-backend/src/realm_training_backing.cc diff --git a/.proj.toml b/.proj.toml index c895fcecc4..c1612ce918 100644 --- a/.proj.toml +++ b/.proj.toml @@ -27,6 +27,7 @@ test_targets = [ "compiler-tests", "substitution-generator-tests", "local-execution-tests", + #"realm-backend-tests", "models-tests", ] diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 972c656126..136bb29528 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) add_subdirectory(local-execution) +add_subdirectory(realm-backend) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 623816567e..436d8cc8b0 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -17,4 +17,4 @@ ff_add_library( legion ) -add_subdirectory(test) +# add_subdirectory(test) diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h new file mode 100644 index 0000000000..626698cba6 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H + +#include "pcg/layer_guid_t.dtg.h" +#include "pcg/computation_graph.h" +#include "local-execution/per_device_op_state.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "realm-backend/realm_task_argument_accessor.h" + +namespace FlexFlow { + +struct LocalArgsBacking { + LocalArgsBacking(RuntimeArgConfig const &); + +public: + void add_per_device_op_state(layer_guid_t const &, + DeviceSpecificDeviceStates const &); + + ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; + + ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; + ConcreteArgSpec 
lower_to_concrete_arg_spec(OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &) const; + +public: + // arguments + std::unordered_map + per_device_op_states; + RuntimeArgConfig runtime_arg_config; +}; + +} + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h new file mode 100644 index 0000000000..ca4bc9db02 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -0,0 +1,55 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H + +#include "local-execution/slot_tensor_type_id.dtg.h" +#include "local-execution/task_argument_accessor.h" +#include +#include + +namespace FlexFlow { + +using TensorSlotsBacking = std::unordered_map< + SlotTensorTypeId, + std::variant>>; +using ArgSlotsBacking = std::unordered_map; + +struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { + LocalTaskArgumentAccessor(Allocator const &allocator, + TensorSlotsBacking const &tensor_slots_backing, + ArgSlotsBacking const &arg_slots_backing); + + LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; + LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; + + ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; + + GenericTensorAccessor get_tensor(slot_id_t slot, + Permissions priv, + TensorType tensor_type) const override; + VariadicGenericTensorAccessor get_variadic_tensor( + slot_id_t slot, Permissions priv, TensorType tensor_type) const override; + + Allocator get_allocator() const override; + + size_t get_device_idx() const override; + +private: + Allocator allocator; + TensorSlotsBacking tensor_slots_backing; + ArgSlotsBacking arg_slots_backing; +}; + +using TensorSlotsBackingWithoutAddresses = std::unordered_map< + SlotTensorTypeId, + std::variant, + std::vector>>>; + +TensorSlotsBackingWithoutAddresses + get_slots_backing_without_tensor_allocation_addresses( + TensorSlotsBacking const &); + +CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h new file mode 100644 index 0000000000..2d9fa0bbdf --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -0,0 +1,58 @@ + +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H + +#include "kernels/accessor.h" +#include "realm-backend/realm_task_argument_accessor.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/lowered_tensor_source.h" +#include "local-execution/optimizer_tensor_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +using TensorBackingMap = + std::unordered_map; + +struct LocalTensorBacking { + LocalTensorBacking(); + +public: + void allocate_layer_tensors(layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_tensors_by_role(TensorRole const &, + layer_guid_t const &, + ComputationGraph const &, + Allocator &); + void allocate_optimizer_tensors(tensor_guid_t const &, + std::vector const &, + Allocator 
&); + TensorSlotsBacking + construct_tensor_slots_backing(TaskBinding const &) const; + + GenericTensorAccessorW const & + get_tensor_backing(lowered_tensor_t const &) const; + + bool is_tensor_allocated(lowered_tensor_t const &) const; + +public: + // tensors + TensorBackingMap tensor_backings; + + std::unordered_map tensor_lowering_mapping; + std::unordered_map gradient_tensor_lowering_mapping; + std::unordered_map optimizer_tensor_lowering_mapping; + std::unordered_map loss_tensor_lowering_mapping; + + LoweredTensorSource lowered_tensor_source; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h new file mode 100644 index 0000000000..e5385a93c3 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -0,0 +1,58 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H +#define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H + +#include "realm-backend/realm_tensor_backing.h" +#include "realm-backend/realm_args_backing.h" +#include "local-execution/task_registry.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "local-execution/optimizer_tensor_source.h" + +namespace FlexFlow { + +using PerLayerElapsedTime = + std::unordered_map>; + +struct LocalTrainingBacking { + LocalTrainingBacking(Allocator const &, + ComputationGraph const &, + RuntimeArgConfig const &); + void register_and_allocate_layer(layer_guid_t const &); + void allocate_layer_optimizer_tensors(layer_guid_t const &, + OptimizerAttrs const &); + + void execute_init(layer_guid_t const &); + std::optional execute_forward(layer_guid_t const &); + void compute_loss(LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); + std::optional execute_backward(layer_guid_t const &); + void execute_update(layer_guid_t const &, OptimizerAttrs const &); + + TaskArgumentAccessor + get_task_arg_accessor(TaskInvocation const &) const; + + TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; + +private: + DeviceSpecificDeviceStates call_init_task_impl(task_id_t, + TaskArgumentAccessor const &); + std::optional call_task_impl(task_id_t, TaskArgumentAccessor); + +private: + Allocator allocator; + ComputationGraph computation_graph; + TaskRegistry task_registry; + + // optimizer + OptimizerTensorSource optimizer_tensor_source; + std::unordered_map> layer_optimizer_tensor_ids; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc new file mode 100644 index 0000000000..46efb17bc1 --- /dev/null +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -0,0 +1,209 @@ +#include "realm-backend/realm_training_backing.h" +#include "local-execution/loss_functions.h" +#include "local-execution/optimizer.h" +#include "local-execution/task_invocation.h" +#include "local-execution/task_signature_impl.h" +#include "local-execution/tensor_lowering.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/get_only.h" +#include "utils/containers/values.h" +#include "utils/exception.h" + +namespace FlexFlow { + 
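+// A rough usage sketch of this backing (assumes pcg's topological_ordering;
+// allocator construction is left abstract):
+//
+//   LocalTrainingBacking backing{allocator, computation_graph,
+//                                runtime_arg_config};
+//   for (layer_guid_t const &layer : topological_ordering(computation_graph)) {
+//     backing.register_and_allocate_layer(layer);
+//     backing.execute_init(layer);
+//   }
+//   for (layer_guid_t const &layer : topological_ordering(computation_graph)) {
+//     backing.execute_forward(layer);
+//   }
+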
+LocalTrainingBacking::LocalTrainingBacking(
+    Allocator const &allocator,
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config)
+    : allocator(allocator), computation_graph(computation_graph),
+      local_args_backing(runtime_arg_config),
+      task_registry(empty_task_registry()) {}
+
+void LocalTrainingBacking::register_and_allocate_layer(
+    layer_guid_t const &node) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  this->local_tensor_backing.allocate_layer_tensors(
+      node, this->computation_graph, this->allocator);
+  register_tasks_for_layer(this->task_registry, node, attrs);
+}
+
+void LocalTrainingBacking::allocate_layer_optimizer_tensors(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  ComputationGraphOpAttrs attrs =
+      get_layer_attrs(this->computation_graph, node).attrs;
+  if (attrs.has<WeightAttrs>()) {
+    TaskSignature sig = get_update_signature(optimizer_attrs);
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+
+    std::vector<optimizer_tensor_t> optimizer_tensors;
+    for (TensorTypeSlotSpec const &tensor_type_slot_spec :
+         values(sig.tensor_guid_slots)) {
+      optimizer_tensors.push_back(
+          this->optimizer_tensor_source.new_optimizer_tensor());
+    }
+    this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors});
+    this->local_tensor_backing.allocate_optimizer_tensors(
+        weight_tensor, optimizer_tensors, this->allocator);
+  }
+}
+
+DeviceSpecificDeviceStates
+    LocalTrainingBacking::call_init_task_impl(task_id_t task_id,
+                                              TaskArgumentAccessor const &acc) {
+  TaskSignatureAndImpl task_sig_impl =
+      this->task_registry.task_mapping.at(task_id);
+  auto fn =
+      task_sig_impl.impl_function.get<InitOpTaskImplFunction>().function_ptr;
+  return fn(acc);
+}
+
+std::optional<float>
+    LocalTrainingBacking::call_task_impl(task_id_t task_id,
+                                         TaskArgumentAccessor acc) {
+  TaskSignatureAndImpl task_sig_impl =
+      this->task_registry.task_mapping.at(task_id);
+  auto fn =
+      task_sig_impl.impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
+  return fn(acc);
+}
+
+void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::INIT)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(init(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    DeviceSpecificDeviceStates device_state =
+        this->call_init_task_impl(invocation.task_id, accessor);
+    this->local_args_backing.add_per_device_op_state(operator_node,
+                                                     device_state);
+  }
+}
+
+std::optional<float>
+    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::FWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(forward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
+  }
+}
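+
+// Loss is computed once at the end of the forward pass: compute_loss binds
+// the logit tensor (a forward output), its gradient, and the separately
+// allocated label tensor, then runs the loss backward kernel to seed the
+// gradients that execute_backward propagates.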
+
+void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                        tensor_guid_t const &logit_tensor,
+                                        loss_tensor_t const &label_tensor) {
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
+
+std::optional<float>
+    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(
+          this->task_registry, operator_node, OpTaskType::BWD)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, operator_node).attrs;
+
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(backward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    return this->call_task_impl(invocation.task_id, accessor);
+  } else {
+    return std::nullopt;
+  }
+}
+
+void LocalTrainingBacking::execute_update(
+    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+  LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
+  if (layer_attrs.attrs.has<WeightAttrs>()) {
+    // get tensors
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
+        this->layer_optimizer_tensor_ids.at(node);
+
+    // get invocation
+    TaskInvocation invocation = get_update_invocation(
+        optimizer_attrs, weight_tensor, optimizer_buffer_tensors);
+
+    // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+    // assert(is_invocation_valid(get_update_signature(attrs), invocation));
+
+    // execute update
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
+  }
+}
+
+TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+    TaskInvocation const &invocation) const {
+  TensorSlotsBacking tensor_slots_backing =
+      this->local_tensor_backing.construct_tensor_slots_backing(
+          invocation.binding);
+  ArgSlotsBacking arg_slots_backing =
+      this->local_args_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      this->allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+TaskInvocation LocalTrainingBacking::lower_to_task_invocation(
+    OpTaskInvocation const &op_task_invocation,
+    layer_guid_t const &layer_guid) const {
+  TaskBinding binding;
+  // tensors
+  for (auto const &tensor_binding :
+       op_task_invocation.binding.get_tensor_bindings()) {
+    tensor_guid_t tensor_to_bind = [&] {
+      switch (tensor_binding.second.role) {
+        case TensorRole::INPUT:
+          return get_incoming_inputs(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::OUTPUT:
+          return get_outgoing_tensors(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::WEIGHT:
+          return get_incoming_weights(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        default:
+          throw mk_runtime_error(fmt::format("Invalid tensor role {}",
+                                             tensor_binding.second.role));
+      }
+    }();
+
+    if (tensor_binding.first.is_grad == IsGrad::NO) {
+      binding.bind(tensor_binding.first.slot_id, tensor_to_bind);
+    } else if (tensor_binding.first.is_grad == IsGrad::YES) {
+      binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind);
+    } else {
+      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
+                                         tensor_binding.first.is_grad));
+    }
+  }
+
+  // args
+  for (auto const &arg_binding :
+       op_task_invocation.binding.get_arg_bindings()) {
+    if (arg_binding.second.has<OpArgRefSpec>()) {
+      ConcreteArgSpec concrete_arg =
+          this->local_args_backing.lower_to_concrete_arg_spec(
+              arg_binding.second.get<OpArgRefSpec>(),
+              this->computation_graph,
+              layer_guid);
+      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
+    } else if (arg_binding.second.has<ConcreteArgSpec>()) {
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
+    } else {
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
+    }
+  }
+
+  return TaskInvocation{op_task_invocation.task_id, binding};
+}
+
+} // namespace FlexFlow
From 1c55cf7fa10de442ab1bbc0b6c11a04cebb76468 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Mon, 27 Jan 2025 22:31:38 -0800
Subject: Merge branch 'master' of github.com:flexflow/flexflow-train into
 realm-backend

From b9144ad0b3107d75011a602634b5cfa05fe58a69 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Thu, 30 Jan 2025 06:19:34 -0800
Subject: [PATCH 30/91] chore: update legion

---
 .flake/pkgs/legion.nix | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
index 814ef85e00..ebef22f4c5 100644
--- a/.flake/pkgs/legion.nix
+++ b/.flake/pkgs/legion.nix
@@ -18,13 +18,13 @@ in
 
 stdenv.mkDerivation rec {
   pname = "legion_flexflow";
-  version = "2024-03-13";
+  version = "2025-01-21";
 
   src = fetchFromGitLab {
     owner = "StanfordLegion";
     repo = "legion";
-    rev = "24e8c452341dea41427e0ce61e154d61715e6835";
-    sha256 = "sha256-NjCSjphOIew/V24i74I6DModSGcWKLeiSIjts3cFtx4=";
+    rev = "0c5a181e59c07e3af1091a2007378ff9355047fa";
+    sha256 = "sha256-oapo7klN17gmRsmaSsrpup4YJ0dtHxiKFtwz8jyPqzU=";
     fetchSubmodules = true;
   };
 
@@ -33,7 +33,7 @@ stdenv.mkDerivation rec {
   ];
 
   cmakeFlags = [
-    "-DLegion_USE_Python=1"
+    "-DLegion_USE_Python=0"
     "-DLegion_BUILD_BINDINGS=1"
     "-DLegion_USE_CUDA=1"
     "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
From 66647a26bccaecaa1a57852971d0b7a5735d32d4 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Thu, 30 Jan 2025 07:33:27 -0800
Subject: [PATCH 31/91] feat: add legion related code

---
 .../include/realm-backend/driver.h            | 11 +++
 .../include/realm-backend/task_wrapper.h      | 39 ++++++++++
 lib/realm-backend/src/driver.cc               | 24 ++++++
 lib/realm-backend/src/task_wrapper.cc         | 73 +++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100644 lib/realm-backend/include/realm-backend/driver.h
 create mode 100644 lib/realm-backend/include/realm-backend/task_wrapper.h
 create mode 100644 lib/realm-backend/src/driver.cc
 create mode 100644 lib/realm-backend/src/task_wrapper.cc

diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h
new file mode 100644
index 0000000000..77272c36ad
--- /dev/null
+++ b/lib/realm-backend/include/realm-backend/driver.h
@@ -0,0 +1,11 @@
+#ifndef _FLEXFLOW_REALM_BACKEND_DRIVER_H
+#define _FLEXFLOW_REALM_BACKEND_DRIVER_H
+
+#include "realm.h"
+#include "realm/cmdline.h"
+#include "local-execution/task_invocation.h"
+
+void top_level_task(const void *args, size_t arglen, const void *userdata,
+                    size_t userlen, Realm::Processor p);
+
+#endif
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
new file mode 100644
index 0000000000..bf53ca7e93
--- /dev/null
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -0,0 +1,39 @@
+#ifndef _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
+#define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
+
+#include "local-execution/task_registry.h"
+#include "realm-backend/driver.h"
+#include "realm-backend/realm_task_argument_accessor.h"
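+
+// Calling convention: each wrapper below receives a serialized RealmTaskArgs
+// through `args` and writes its output through `result`. A hedged sketch of
+// spawning one (the actual spawn site is outside this header):
+//
+//   RealmTaskArgs task_args{task_id, impl_fn, accessor, &result_storage};
+//   Realm::Event e =
+//       p.spawn(static_cast<Realm::Processor::TaskFuncID>(task_id),
+//               &task_args, sizeof(task_args));
+//   e.wait();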
"realm-backend/realm_task_argument_accessor.h" + +namespace FlexFlow { + +/* The following are general task wrappers to be invoked by the Realm runtime */ + +struct RealmTaskArgs { + task_id_t task_id; + TaskImplFunction impl_function; + TaskArgumentAccessor accessor; + void *result; +}; + +void init_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p); + +void register_wrapper_tasks_init(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id); + +void register_wrapper_tasks(Realm::Processor p, task_id_t task_id, + TaskSignatureAndImpl task_sig_impl); + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc new file mode 100644 index 0000000000..8cfb038d97 --- /dev/null +++ b/lib/realm-backend/src/driver.cc @@ -0,0 +1,24 @@ +#include "realm-backend/driver.h" + +using namespace Realm; +using namespace FlexFlow; + +Logger log_app("app"); + +int main(int argc, const char **argv) { + Runtime rt; + rt.init(&argc, (char ***)&argv); + + Processor::register_task_by_kind(Processor::LOC_PROC, false /*!global*/, + static_cast(task_id_t::TOP_LEVEL_TASK_ID), + CodeDescriptor(top_level_task), + ProfilingRequestSet()) + .external_wait(); + + Processor p = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::LOC_PROC) + .first(); + + rt.shutdown(rt.collective_spawn(p, static_cast(task_id_t::TOP_LEVEL_TASK_ID), 0, 0)); + return rt.wait_for_shutdown(); +} diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc new file mode 100644 index 0000000000..7361a24cd9 --- /dev/null +++ b/lib/realm-backend/src/task_wrapper.cc @@ -0,0 +1,73 @@ +#include "realm-backend/task_wrapper.h" + +namespace FlexFlow { + +using namespace Realm; + +void init_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + *reinterpret_cast(RealmTaskArgs.result) = + fn(RealmTaskArgs.acc); +} + +void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + *reinterpret_cast *>(RealmTaskArgs.result) = + fn(RealmTaskArgs.acc); +} + +void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Processor p) { + RealmTaskArgs const &task_args = + *reinterpret_cast(args); + auto fn = + RealmTaskArgs.impl_function.get().function_ptr; + fn(RealmTaskArgs.acc); +} + +void register_wrapper_tasks_init(Processor p, task_id_t task_id) { + Processor::register_task_by_kind( + p.kind(), false /*!global*/, static_cast(task_id), + CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) + .external_wait(); +} + +void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { + Processor::register_task_by_kind( + p.kind(), false /*!global*/, static_cast(task_id), + CodeDescriptor(fwdbwd_wrapper_task), 
+
+void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) {
+  Processor::register_task_by_kind(
+      p.kind(), false /*!global*/, static_cast<Processor::TaskFuncID>(task_id),
+      CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet())
+      .external_wait();
+}
+
+void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) {
+  Processor::register_task_by_kind(
+      p.kind(), false /*!global*/, static_cast<Processor::TaskFuncID>(task_id),
+      CodeDescriptor(generic_wrapper_task), ProfilingRequestSet())
+      .external_wait();
+}
+
+void register_wrapper_tasks(Processor p, task_id_t task_id,
+                            TaskSignatureAndImpl task_sig_impl) {
+  switch (task_sig_impl.task_signature.type) {
+    case OpTaskType::INIT:
+      register_wrapper_tasks_init(p, task_id);
+      break;
+    case OpTaskType::FWD:
+    case OpTaskType::BWD:
+      register_wrapper_tasks_fwdbwd(p, task_id);
+      break;
+    default:
+      register_wrapper_tasks_generic(p, task_id);
+      break;
+  }
+}
+
+} // namespace FlexFlow
\ No newline at end of file
From 0128abb7f0b76d9a985a5aded5195643a0ef0c1e Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Fri, 31 Jan 2025 18:26:33 -0800
Subject: [PATCH 32/91] Disaggregate local backend

---
 lib/kernels/src/array_shape.cc                |  18 +-
 .../local-execution/gradient_tensor_source.h  |  20 ++
 .../gradient_tensor_t.struct.toml             |  13 +
 .../local-execution/local_args_backing.h      |  32 +-
 .../local-execution/local_tensor_backing.h    |  95 ++++--
 .../local-execution/local_training_backing.h  |  64 ++--
 .../local-execution/loss_tensor_source.h      |  20 ++
 .../local-execution/lowered_tensor_source.h   |   3 +-
 .../local-execution/model_training_instance.h |  27 +-
 .../op_task_to_task_invocation.h              |  30 ++
 .../local-execution/optimizer_tensor_source.h |   3 +-
 .../local-execution/task_argument_accessor.h  |   3 +-
 .../include/local-execution/task_binding.h    |   8 +-
 .../include/local-execution/task_registry.h   |   7 +-
 .../tensor_type_t.variant.toml                |   5 +
 .../src/gradient_tensor_source.cc             |  14 +
 lib/local-execution/src/local_args_backing.cc |  67 ++---
 .../src/local_cost_estimator.cc               |   9 +-
 .../src/local_tensor_backing.cc               | 280 +++++++++++++-----
 .../src/local_training_backing.cc             | 238 +++++++--------
 lib/local-execution/src/loss_functions.cc     |   5 +-
 lib/local-execution/src/loss_tensor_source.cc |  13 +
 .../src/lowered_tensor_source.cc              |   5 +-
 .../src/model_training_instance.cc            |  74 +++--
 .../src/{local-execution => }/op_arg_spec.cc  |   0
 lib/local-execution/src/op_task_invocation.cc |  10 +-
 lib/local-execution/src/op_task_signature.cc  |  32 +-
 .../src/op_task_to_task_invocation.cc         | 108 +++++++
 .../src/optimizer_tensor_source.cc            |   5 +-
 lib/local-execution/src/task_binding.cc       |  52 ++--
 lib/local-execution/src/task_registry.cc      |  13 +-
 .../test/src/test_local_slots_backing.cc      | 118 ++++----
 .../test/src/test_local_task_arg_accessor.cc  |  45 +--
 lib/local-execution/test/src/test_loss_e2e.cc |  24 +-
 .../test/src/test_update_e2e.cc               |   2 +-
 .../include/op-attrs/operator_attrs.h         |   2 +-
 lib/pcg/include/pcg/computation_graph.h       |   2 +
 lib/pcg/include/pcg/optimizer_attrs.h         |   1 +
 lib/pcg/src/pcg/computation_graph.cc          |   5 +
 lib/pcg/src/pcg/optimizer_attrs.cc            |  13 +
 40 files changed, 938 insertions(+), 547 deletions(-)
 create mode 100644 lib/local-execution/include/local-execution/gradient_tensor_source.h
 create mode 100644 lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml
 create mode 100644 lib/local-execution/include/local-execution/loss_tensor_source.h
 create mode 100644 lib/local-execution/include/local-execution/op_task_to_task_invocation.h
 create mode 100644 lib/local-execution/src/gradient_tensor_source.cc
 create mode 100644 lib/local-execution/src/loss_tensor_source.cc
 rename lib/local-execution/src/{local-execution => }/op_arg_spec.cc (100%)
 create mode 100644
lib/local-execution/src/op_task_to_task_invocation.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index eb2b88b203..e8685048c6 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -57,15 +57,15 @@ std::size_t ArrayShape::at(ff_dim_t idx) const { ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - std::optional legion_start = transform( - start, [&](auto const &start_unwrapped) { - return legion_dim_from_ff_dim(start_unwrapped, num_dims()); - }); - - std::optional legion_end = transform( - end, [&](auto const &end_unwrapped) { - return legion_dim_from_ff_dim(end_unwrapped, num_dims()); - }); + std::optional legion_start = + transform(start, [&](auto const &start_unwrapped) { + return legion_dim_from_ff_dim(start_unwrapped, num_dims()); + }); + + std::optional legion_end = + transform(end, [&](auto const &end_unwrapped) { + return legion_dim_from_ff_dim(end_unwrapped, num_dims()); + }); return this->sub_shape(legion_start, legion_end); } diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h new file mode 100644 index 0000000000..bb7a4c7aa8 --- /dev/null +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H + +#include "local-execution/gradient_tensor_t.dtg.h" + +namespace FlexFlow { + +struct GradientTensorSource { +public: + GradientTensorSource(); + + gradient_tensor_t new_gradient_tensor(); + +private: + static size_t next_available_gradient_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml b/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml new file mode 100644 index 0000000000..5367ccee07 --- /dev/null +++ b/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml @@ -0,0 +1,13 @@ +namespace = "FlexFlow" +name = "gradient_tensor_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + + +[[fields]] +name = "raw_index" +type = "int" diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index d497c49738..6e6839fea7 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -1,30 +1,19 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/computation_graph.h" -#include "local-execution/per_device_op_state.h" +#include "local-execution/local_task_argument_accessor.h" #include "local-execution/op_task_invocation.h" +#include "local-execution/per_device_op_state.h" #include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" -#include "local-execution/local_task_argument_accessor.h" +#include "pcg/computation_graph.h" +#include "pcg/layer_guid_t.dtg.h" namespace FlexFlow { struct LocalArgsBacking { LocalArgsBacking(RuntimeArgConfig const &); -public: - void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); - - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; - 
ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, - ComputationGraph const &, - layer_guid_t const &) const; - public: // arguments std::unordered_map @@ -32,6 +21,17 @@ struct LocalArgsBacking { RuntimeArgConfig runtime_arg_config; }; -} +void add_per_device_op_state(LocalArgsBacking &, + layer_guid_t const &, + DeviceSpecificDeviceStates const &); + +std::optional + get_per_device_op_state_if_exists(LocalArgsBacking const &, + layer_guid_t const &); + +ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, + RuntimeArgConfig const &); + +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 68a38253f8..825ff0553e 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -3,16 +3,22 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/loss_tensor_source.h" +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_source.h" +#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_source.h" #include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/task_invocation.dtg.h" +#include "local-execution/tensor_role.dtg.h" +#include "local-execution/tensor_type_t.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { @@ -20,39 +26,72 @@ using TensorBackingMap = std::unordered_map; struct LocalTensorBacking { - LocalTensorBacking(); - -public: - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - Allocator &); - void allocate_optimizer_tensors(tensor_guid_t const &, - std::vector const &, - Allocator &); - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &) const; + LocalTensorBacking() = default; + LocalTensorBacking( + std::unordered_map const + &allocated_tensor_backings, + std::unordered_set const &allocated_tensor_guids, + std::unordered_map const + &allocated_gradient_mapping, + std::unordered_map> const + &allocated_optimizer_mapping, + std::unordered_set const &allocated_loss_tensors); - GenericTensorAccessorW const & - get_tensor_backing(lowered_tensor_t const &) const; - - bool is_tensor_allocated(lowered_tensor_t const &) const; + lowered_tensor_t allocate_tensor(TensorShape const &, Allocator &); public: // tensors TensorBackingMap tensor_backings; - + std::unordered_map tensor_lowering_mapping; - std::unordered_map gradient_tensor_lowering_mapping; - std::unordered_map optimizer_tensor_lowering_mapping; - std::unordered_map loss_tensor_lowering_mapping; + std::unordered_map + gradient_tensor_lowering_mapping; + std::unordered_map + optimizer_tensor_lowering_mapping; + std::unordered_map + loss_tensor_lowering_mapping; + + 
std::unordered_map tensor_gradient_mapping; + std::unordered_map> + tensor_optimizer_mapping; +private: + lowered_tensor_t insert_tensor(GenericTensorAccessorW const &); LoweredTensorSource lowered_tensor_source; }; +void allocate_tensor_guid(LocalTensorBacking &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); +void allocate_gradient_tensor(LocalTensorBacking &, + gradient_tensor_t const &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); +void allocate_optimizer_tensors(LocalTensorBacking &, + std::vector const &, + tensor_guid_t const &, + TensorShape const &, + Allocator &); + +void allocate_all_computation_graph_tensors(LocalTensorBacking &, + GradientTensorSource &, + ComputationGraph const &, + Allocator &); +void allocate_all_optimizer_tensors(LocalTensorBacking &, + OptimizerTensorSource &, + ComputationGraph const &, + Allocator &, + OptimizerAttrs const &); +loss_tensor_t allocate_loss_tensor(LocalTensorBacking &, + LossTensorSource const &, + TensorShape const &, + Allocator &); + +TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, + TaskBinding const &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index a915f3e420..b712be9a93 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,58 +1,60 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/local_tensor_backing.h" #include "local-execution/local_args_backing.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/optimizer_tensor_source.h" #include "local-execution/task_registry.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/optimizer_tensor_source.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct LocalTrainingBacking { LocalTrainingBacking(Allocator const &, ComputationGraph const &, - RuntimeArgConfig const &); - void register_and_allocate_layer(layer_guid_t const &); - void allocate_layer_optimizer_tensors(layer_guid_t const &, - OptimizerAttrs const &); - - void execute_init(layer_guid_t const &); - std::optional execute_forward(layer_guid_t const &); - void compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); - std::optional execute_backward(layer_guid_t const &); - void execute_update(layer_guid_t const &, OptimizerAttrs const &); - - TaskArgumentAccessor - get_task_arg_accessor(TaskInvocation const &) const; - - TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const; + LocalTensorBacking const &, + LocalArgsBacking const &); +public: LocalTensorBacking local_tensor_backing; LocalArgsBacking local_args_backing; -private: - DeviceSpecificDeviceStates call_init_task_impl(task_id_t, - TaskArgumentAccessor const &); - std::optional call_task_impl(task_id_t, TaskArgumentAccessor); - -private: Allocator allocator; ComputationGraph computation_graph; TaskRegistry task_registry; - // optimizer - OptimizerTensorSource optimizer_tensor_source; - std::unordered_map> layer_optimizer_tensor_ids; +private: + GradientTensorSource gradient_tensor_source; }; 
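+
+// Driving a training step is now a sequence of free-function calls over the
+// backing; a hedged sketch (assumes pcg's topological_ordering and the
+// reversed helper from utils):
+//
+//   for (layer_guid_t const &l : topological_ordering(cg)) {
+//     execute_init(backing, l);
+//   }
+//   for (layer_guid_t const &l : topological_ordering(cg)) {
+//     execute_forward(backing, l);
+//   }
+//   compute_loss(backing, loss_attrs, logit_tensor, label_tensor);
+//   for (layer_guid_t const &l : reversed(topological_ordering(cg))) {
+//     execute_backward(backing, l);
+//     execute_update(backing, l, optimizer_attrs);
+//   }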
+DeviceSpecificDeviceStates call_init_task_impl(TaskRegistry const &, + task_id_t task_id, + TaskArgumentAccessor const &acc); + +std::optional call_task_impl(TaskRegistry const &, + task_id_t task_id, + TaskArgumentAccessor acc); + +void execute_init(LocalTrainingBacking &, layer_guid_t const &); +std::optional execute_forward(LocalTrainingBacking &, + layer_guid_t const &); +std::optional execute_backward(LocalTrainingBacking &, + layer_guid_t const &); +void compute_loss(LocalTrainingBacking const &, + LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); +void execute_update(LocalTrainingBacking &, + layer_guid_t const &, + OptimizerAttrs const &); + +TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, + LocalArgsBacking const &, + TaskInvocation const &, + Allocator &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h new file mode 100644 index 0000000000..2b55f1af01 --- /dev/null +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H + +#include "local-execution/loss_tensor_t.dtg.h" + +namespace FlexFlow { + +struct LossTensorSource { +public: + LossTensorSource(); + + loss_tensor_t new_loss_tensor(); + +private: + static size_t next_available_loss_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h index 63cc2cd31e..e4fc4ff56c 100644 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -14,8 +14,7 @@ struct LoweredTensorSource { private: static size_t next_available_lowered_tensor_id; }; - -} // namespace FlexFlow +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index dd6a6f33d7..81aacf2a53 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H #include "local-execution/local_training_backing.h" +#include "local-execution/loss_tensor_t.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { @@ -12,27 +12,28 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(Allocator const &, - ComputationGraph const &, - RuntimeArgConfig const &, + ModelTrainingInstance(LocalTrainingBacking const &, + tensor_guid_t const & logit_tensor, + TensorShape const & label_tensor_shape, LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, OptimizerAttrs const &); - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); - - ComputationGraph computation_graph; LocalTrainingBacking training_backing; LossAttrs loss_attrs; + OptimizerAttrs optimizer_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; - OptimizerAttrs optimizer_attrs; + 
+private: + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; }; +void init(ModelTrainingInstance &); +PerLayerElapsedTime forward(ModelTrainingInstance &); +PerLayerElapsedTime backward(ModelTrainingInstance &); +void update(ModelTrainingInstance &); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h new file mode 100644 index 0000000000..44e10d4b51 --- /dev/null +++ b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H +#define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H + +#include "local-execution/device_specific_device_states.dtg.h" +#include "local-execution/op_task_invocation.h" +#include "local-execution/runtime_arg_config.h" +#include "local-execution/task_invocation.dtg.h" +#include "pcg/computation_graph.dtg.h" +#include "pcg/layer_guid_t.dtg.h" + +namespace FlexFlow { + +TaskInvocation + lower_to_task_invocation(OpTaskInvocation const &, + layer_guid_t const &, + ComputationGraph const &, + std::optional const &); + +ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, + RuntimeArgConfig const &); + +ConcreteArgSpec lower_to_concrete_arg_spec( + OpArgRefSpec const &, + ComputationGraph const &, + layer_guid_t const &, + std::optional const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index fc5015b299..658c545225 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -14,8 +14,7 @@ struct OptimizerTensorSource { private: static size_t next_available_optimizer_tensor_id; }; - -} // namespace FlexFlow +} // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 8b8516045d..16a63a789b 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -65,7 +65,8 @@ struct TaskArgumentAccessor { // } // template - // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const { + // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const + // { // return std::get>( // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); // } diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 33636616b3..21fc813a6b 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_t.dtg.h" #include "local-execution/slot_id_t.dtg.h" #include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_arg_spec.dtg.h" #include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature.dtg.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include 
"local-execution/loss_tensor_t.dtg.h" #include "local-execution/tensor_type_t.dtg.h" namespace FlexFlow { @@ -17,10 +17,10 @@ struct TaskBinding { TaskBinding() = default; void bind(int, tensor_guid_t const &); - void bind(slot_id_t, tensor_guid_t const &); + void bind(slot_id_t, tensor_guid_t const &); void bind_grad(int, tensor_guid_t const &); - void bind_grad(slot_id_t, tensor_guid_t const &); + void bind_grad(slot_id_t, tensor_guid_t const &); void bind(int, optimizer_tensor_t const &); void bind(slot_id_t, optimizer_tensor_t const &); diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index fa3e558337..1669822c83 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -15,8 +15,11 @@ void register_tasks_for_layer(TaskRegistry &, ComputationGraphOpAttrs const &attrs); bool registry_contains_task_for_layer(TaskRegistry const &, - layer_guid_t const &, - OpTaskType const &); + layer_guid_t const &, + OpTaskType const &); + +void register_all_computation_graph_tasks(TaskRegistry &, + ComputationGraph const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml index d4e525c348..cd3520ee5d 100644 --- a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml +++ b/lib/local-execution/include/local-execution/tensor_type_t.variant.toml @@ -10,6 +10,7 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", "local-execution/optimizer_tensor_t.dtg.h", + "local-execution/gradient_tensor_t.dtg.h", "local-execution/loss_tensor_t.dtg.h" ] @@ -17,6 +18,10 @@ includes = [ type = "::FlexFlow::tensor_guid_t" key = "tensor_guid" +[[values]] +type = "::FlexFlow::gradient_tensor_t" +key = "gradient_tensor" + [[values]] type = "::FlexFlow::optimizer_tensor_t" key = "optimizer_tensor" diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/local-execution/src/gradient_tensor_source.cc new file mode 100644 index 0000000000..28cec16ef9 --- /dev/null +++ b/lib/local-execution/src/gradient_tensor_source.cc @@ -0,0 +1,14 @@ +#include "local-execution/gradient_tensor_source.h" + +namespace FlexFlow { + +size_t GradientTensorSource::next_available_gradient_tensor_id = 0; + +GradientTensorSource::GradientTensorSource() {} + +gradient_tensor_t GradientTensorSource::new_gradient_tensor() { + return gradient_tensor_t{ + GradientTensorSource::next_available_gradient_tensor_id++}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index 0c3cfe70e8..d8a94fb2c5 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -1,62 +1,43 @@ #include "local-execution/local_args_backing.h" -#include "utils/containers/map_values.h" +#include "local-execution/op_task_to_task_invocation.h" +#include "op-attrs/parallel_tensor_shape.h" #include "utils/containers/contains_key.h" +#include "utils/containers/map_values.h" #include "utils/overload.h" -#include "op-attrs/parallel_tensor_shape.h" namespace FlexFlow { +LocalArgsBacking::LocalArgsBacking(RuntimeArgConfig const &runtime_arg_config) + : runtime_arg_config(runtime_arg_config){}; -void LocalArgsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - 
this->per_device_op_states.insert({op_guid, device_state}); +void add_per_device_op_state(LocalArgsBacking &local_args_backing, + layer_guid_t const &op_guid, + DeviceSpecificDeviceStates const &device_state) { + local_args_backing.per_device_op_states.insert({op_guid, device_state}); } -ArgSlotsBacking LocalArgsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { +std::optional get_per_device_op_state_if_exists( + LocalArgsBacking const &local_args_backing, + layer_guid_t const &layer_guid) { + if (contains_key(local_args_backing.per_device_op_states, layer_guid)) { + return local_args_backing.per_device_op_states.at(layer_guid); + } else { + return std::nullopt; + } +} + +ArgSlotsBacking + construct_arg_slots_backing(TaskBinding const &binding, + RuntimeArgConfig const &runtime_arg_config) { return map_values( binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { return arg_binding.template visit( overload{[&](RuntimeArgRefSpec const &s) { - return this->lower_to_concrete_arg_spec(s); + return lower_to_concrete_arg_spec(s, runtime_arg_config); }, [](ConcreteArgSpec const &s) { return s; }}); }); ; } -ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( - OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const & cg, layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(this->per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } -} - -ConcreteArgSpec LocalArgsBacking::lower_to_concrete_arg_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } -} - -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index b416378e66..b959f31a8b 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,16 +1,16 @@ #include "local-execution/local_cost_estimator.h" -#include "local-execution/tensor_lowering.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" +#include "local-execution/tensor_lowering.h" #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" #include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" +#include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" -#include "utils/containers/sum.h" namespace FlexFlow { @@ -64,9 +64,8 @@ CostDetails 
LocalCostEstimator::estimate_cost( }), get_vector_piece_attrs(outputs)); - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - this->runtime_arg_config); + LocalTrainingBacking local_backing( + allocator, cg_builder.computation_graph, this->runtime_arg_config); local_backing.register_and_allocate_layer(layer_added_result.layer); local_backing.execute_init(layer_added_result.layer); float fwd = local_backing.execute_forward(layer_added_result.layer).value(); diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 9da74c27b9..de058d88ad 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,119 +1,235 @@ #include "local-execution/local_tensor_backing.h" +#include "local-execution/slot_grad_id.dtg.h" #include "local-execution/tensor_lowering.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" #include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" #include "utils/overload.h" -#include "local-execution/slot_grad_id.dtg.h" namespace FlexFlow { -LocalTensorBacking::LocalTensorBacking() {}; +LocalTensorBacking::LocalTensorBacking( + std::unordered_map const + &allocated_tensor_backings, + std::unordered_set const &allocated_tensor_guids, + std::unordered_map const + &allocated_gradient_mapping, + std::unordered_map> const + &allocated_optimizer_mapping, + std::unordered_set const &allocated_loss_tensors) + : tensor_gradient_mapping(allocated_gradient_mapping), + tensor_optimizer_mapping(allocated_optimizer_mapping) { -void LocalTensorBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); -} + // computation graph tensors + for (tensor_guid_t const &allocated_tensor_guid : allocated_tensor_guids) { + lowered_tensor_t lowered_tensor = this->insert_tensor( + allocated_tensor_backings.at(TensorTypeVariant{allocated_tensor_guid})); + this->tensor_lowering_mapping.insert( + {allocated_tensor_guid, lowered_tensor}); + } -void LocalTensorBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - Allocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); + // gradient tensors + for (std::pair const + &tensor_guid_gradient_pair : allocated_gradient_mapping) { + gradient_tensor_t allocated_gradient_tensor = + tensor_guid_gradient_pair.second; + lowered_tensor_t lowered_tensor = + this->insert_tensor(allocated_tensor_backings.at( + TensorTypeVariant{allocated_gradient_tensor})); + this->gradient_tensor_lowering_mapping.insert( + {allocated_gradient_tensor, lowered_tensor}); } - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs 
tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_backings.insert({reduced_tensor, tensor_backing}); + // optimizer tensors + for (std::pair> const + &tensor_guid_optimizers_pair : allocated_optimizer_mapping) { + for (optimizer_tensor_t const &allocated_optimizer_tensor : + tensor_guid_optimizers_pair.second) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(allocated_tensor_backings.at( + TensorTypeVariant{allocated_optimizer_tensor})); + this->optimizer_tensor_lowering_mapping.insert( + {allocated_optimizer_tensor, lowered_tensor}); } + } + + // loss tensors + for (loss_tensor_t const &allocated_loss_tensor : allocated_loss_tensors) { + lowered_tensor_t lowered_tensor = this->insert_tensor( + allocated_tensor_backings.at(TensorTypeVariant{allocated_loss_tensor})); + this->loss_tensor_lowering_mapping.insert( + {allocated_loss_tensor, lowered_tensor}); + } + + // sanity check that backings match up with the mappings + assert(this->tensor_backings.size() == allocated_tensor_backings.size()); +}; + +lowered_tensor_t LocalTensorBacking::insert_tensor( + GenericTensorAccessorW const &tensor_backing) { + lowered_tensor_t lowered_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); + return lowered_tensor; +} + +lowered_tensor_t + LocalTensorBacking::allocate_tensor(TensorShape const &tensor_shape, + Allocator &allocator) { + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_shape); + return this->insert_tensor(tensor_backing); +} - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - GenericTensorAccessorW gradient_tensor_backing = - allocator.allocate_tensor(tensor_attrs.shape); - this->tensor_backings.insert( - {reduced_tensor, gradient_tensor_backing}); +void allocate_tensor_guid(LocalTensorBacking &local_tensor_backing, + tensor_guid_t const &tensor_guid, + TensorShape const &tensor_shape, + Allocator &allocator) { + if (!contains_key(local_tensor_backing.tensor_lowering_mapping, + tensor_guid)) { + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.tensor_lowering_mapping.insert( + {tensor_guid, lowered_tensor}); + } +} + +void allocate_gradient_tensor(LocalTensorBacking &local_tensor_backing, + gradient_tensor_t const &gradient_tensor, + tensor_guid_t const &tensor_guid, + TensorShape const &tensor_shape, + Allocator &allocator) { + if (!contains_key(local_tensor_backing.tensor_gradient_mapping, + tensor_guid)) { + local_tensor_backing.tensor_gradient_mapping.insert( + {tensor_guid, gradient_tensor}); + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + } +} + +void allocate_optimizer_tensors( + LocalTensorBacking &local_tensor_backing, + std::vector const 
&optimizer_tensors,
+    tensor_guid_t const &tensor_guid,
+    TensorShape const &tensor_shape,
+    Allocator &allocator) {
+  if (!contains_key(local_tensor_backing.tensor_optimizer_mapping,
+                    tensor_guid)) {
+    // insert the given optimizer tensors into the mappings
+    for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) {
+      // allocate a lowered tensor backing for each optimizer tensor
+      lowered_tensor_t lowered_tensor =
+          local_tensor_backing.allocate_tensor(tensor_shape, allocator);
+      local_tensor_backing.optimizer_tensor_lowering_mapping.insert(
+          {optimizer_tensor, lowered_tensor});
     }
+    local_tensor_backing.tensor_optimizer_mapping.insert(
+        {tensor_guid, optimizer_tensors});
   }
 }
 
-void LocalTensorBacking::allocate_optimizer_tensors(
-    tensor_guid_t const &weight,
-    std::vector<optimizer_tensor_t> const& optimizer_tensors,
+void allocate_loss_tensor(LocalTensorBacking &local_tensor_backing,
+                          loss_tensor_t const &loss_tensor,
+                          TensorShape const &tensor_shape,
+                          Allocator &allocator) {
+  lowered_tensor_t lowered_tensor =
+      local_tensor_backing.allocate_tensor(tensor_shape, allocator);
+  local_tensor_backing.loss_tensor_lowering_mapping.insert(
+      {loss_tensor, lowered_tensor});
+}
+
+void allocate_all_computation_graph_tensors(
+    LocalTensorBacking &local_tensor_backing,
+    GradientTensorSource &gradient_tensor_source,
+    ComputationGraph const &computation_graph,
     Allocator &allocator) {
-  GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight));
-  for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) {
-    // optimizer tensor allocation
-    if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) {
-      lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor();
-      this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor});
-      GenericTensorAccessorW buffer_backing = allocator.allocate_tensor(
-          get_tensor_shape(weight_backing.shape, weight_backing.data_type));
-      this->tensor_backings.insert({buffer_tensor, buffer_backing});
+  // allocate each layer's tensors and gradient tensors
+  for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
+    allocate_tensor_guid(
+        local_tensor_backing, tensor_guid, tensor_attrs.shape, allocator);
+
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      gradient_tensor_t gradient_tensor =
+          gradient_tensor_source.new_gradient_tensor();
+      allocate_gradient_tensor(local_tensor_backing,
+                               gradient_tensor,
+                               tensor_guid,
+                               tensor_attrs.shape,
+                               allocator);
     }
   }
 }
 
-bool LocalTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const {
-  return contains_key(tensor_backings, tensor_id);
+void allocate_all_optimizer_tensors(
+    LocalTensorBacking &local_tensor_backing,
+    OptimizerTensorSource &optimizer_tensor_source,
+    ComputationGraph const &computation_graph,
+    Allocator &allocator,
+    OptimizerAttrs const &optimizer_attrs) {
+  for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      std::vector<optimizer_tensor_t> optimizer_tensors;
+      for (int i = 0; i < get_num_optimizer_tensors(optimizer_attrs); ++i) {
+        optimizer_tensors.push_back(
+            optimizer_tensor_source.new_optimizer_tensor());
+      }
+      allocate_optimizer_tensors(local_tensor_backing,
+                                 optimizer_tensors,
+                                 tensor_guid,
+                                 tensor_attrs.shape,
+                                 allocator);
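+      // one buffer is created per optimizer slot reported by
+      // get_num_optimizer_tensors, e.g. a single momentum buffer for SGD
+      // with momentum (illustrative; the slot count depends on the
+      // OptimizerAttrs)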
+ } + } } -GenericTensorAccessorW const &LocalTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - return this->tensor_backings.at(tensor_id); +loss_tensor_t allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, + LossTensorSource &loss_tensor_source, + TensorShape const &tensor_shape, + Allocator &allocator) { + loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); + lowered_tensor_t lowered_tensor = + local_tensor_backing.allocate_tensor(tensor_shape, allocator); + local_tensor_backing.loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + return loss_tensor; } -TensorSlotsBacking LocalTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { +TensorSlotsBacking construct_tensor_slots_backing( + LocalTensorBacking const &local_tensor_backing, + TaskBinding const &binding) { TensorSlotsBacking mapping; for (auto const &tensor_binding : binding.get_tensor_bindings()) { SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - lowered_tensor_t tensor_id = [&] { - TensorTypeVariant tensor_type = tensor_binding.second; - if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) { - return this->tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { - return this->gradient_tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has()) { - return this->optimizer_tensor_lowering_mapping.at(tensor_type.get()); - } else if (tensor_type.has()) { - return this->loss_tensor_lowering_mapping.at(tensor_type.get()); - } else { - throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); - } - }(); + lowered_tensor_t lowered_tensor = + tensor_binding.second.visit(overload{ + [&](tensor_guid_t const &t) { + return local_tensor_backing.tensor_lowering_mapping.at(t); + }, + [&](gradient_tensor_t const &t) { + return local_tensor_backing.gradient_tensor_lowering_mapping.at( + t); + }, + [&](optimizer_tensor_t const &t) { + return local_tensor_backing.optimizer_tensor_lowering_mapping.at( + t); + }, + [&](loss_tensor_t const &t) { + return local_tensor_backing.loss_tensor_lowering_mapping.at(t); + }, + }); - GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + GenericTensorAccessorW accessor = + local_tensor_backing.tensor_backings.at(lowered_tensor); mapping.insert({slot_tensor_type_id, accessor}); } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 22dc3b8397..4893d9be88 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,5 +1,6 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" +#include "local-execution/op_task_to_task_invocation.h" #include "local-execution/optimizer.h" #include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" @@ -17,127 +18,152 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) + LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing) : allocator(allocator), computation_graph(computation_graph), - local_args_backing(runtime_arg_config), - task_registry(empty_task_registry()) {}; - -void LocalTrainingBacking::register_and_allocate_layer( - 
layer_guid_t const &node) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - this->local_tensor_backing.allocate_layer_tensors( - node, this->computation_graph, this->allocator); - register_tasks_for_layer(this->task_registry, node, attrs); -} - -void LocalTrainingBacking::allocate_layer_optimizer_tensors( - layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - TaskSignature sig = get_update_signature(optimizer_attrs); - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); - - std::vector optimizer_tensors; - for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); - } - this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->local_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocator); - } + task_registry(empty_task_registry()), + local_tensor_backing(local_tensor_backing), + local_args_backing(local_args_backing) { + allocate_all_computation_graph_tensors(this->local_tensor_backing, + this->gradient_tensor_source, + this->computation_graph, + this->allocator); + register_all_computation_graph_tasks(this->task_registry, + this->computation_graph); } DeviceSpecificDeviceStates - LocalTrainingBacking::call_init_task_impl(task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); + call_init_task_impl(TaskRegistry const &task_registry, + task_id_t task_id, + TaskArgumentAccessor const &acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -std::optional - LocalTrainingBacking::call_task_impl(task_id_t task_id, - TaskArgumentAccessor acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); +std::optional call_task_impl(TaskRegistry const &task_registry, + task_id_t task_id, + TaskArgumentAccessor acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::INIT)) { +void execute_init(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::INIT)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(init(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + TaskInvocation invocation = + lower_to_task_invocation(init(attrs), + operator_node, + local_training_backing.computation_graph, + std::nullopt); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - DeviceSpecificDeviceStates device_state = - this->call_init_task_impl(invocation.task_id, accessor); - this->local_args_backing.add_per_device_op_state(operator_node, - device_state); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + 
local_training_backing.local_args_backing, + invocation); + DeviceSpecificDeviceStates device_state = call_init_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); + add_per_device_op_state( + local_training_backing.local_args_backing, operator_node, device_state); } } std::optional - LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::FWD)) { + execute_forward(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + std::optional device_state = + get_per_device_op_state_if_exists( + local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = + lower_to_task_invocation(forward(attrs), + operator_node, + local_training_backing.computation_graph, + device_state); TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - return this->call_task_impl(invocation.task_id, accessor); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); + return call_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); } else { return std::nullopt; } } -void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor) { +void compute_loss(LocalTrainingBacking const &local_training_backing, + LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { TaskInvocation loss_invocation = backward(loss_attrs, logit_tensor, label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + loss_invocation); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } std::optional - LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer( - this->task_registry, operator_node, OpTaskType::BWD)) { + execute_backward(LocalTrainingBacking &local_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(local_training_backing.task_registry, + operator_node, + OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - - TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs), operator_node); + get_layer_attrs(local_training_backing.computation_graph, operator_node) + .attrs; + + std::optional device_state = + get_per_device_op_state_if_exists( + local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = + lower_to_task_invocation(backward(attrs), + operator_node, + local_training_backing.computation_graph, + device_state); 
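+    // the per-device state captured during execute_init is threaded back
+    // into the lowered invocation so that per-device op-state argument refs
+    // resolve to this layer's initialized state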
TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); - return this->call_task_impl(invocation.task_id, accessor); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); + return call_task_impl( + local_training_backing.task_registry, invocation.task_id, accessor); } else { return std::nullopt; } } -void LocalTrainingBacking::execute_update( - layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { - LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); +void execute_update(LocalTrainingBacking &local_training_backing, + layer_guid_t const &node, + OptimizerAttrs const &optimizer_attrs) { + LayerAttrs layer_attrs = + get_layer_attrs(local_training_backing.computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node)); - std::vector optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node); + tensor_guid_t weight_tensor = get_only( + get_outgoing_tensors(local_training_backing.computation_graph, node)); + std::vector optimizer_buffer_tensors = + local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( + weight_tensor); // get invocation TaskInvocation invocation = get_update_invocation( @@ -148,62 +174,26 @@ void LocalTrainingBacking::execute_update( // execute update TaskArgumentAccessor accessor = - this->get_task_arg_accessor(invocation); + get_task_arg_accessor(local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing, + invocation, + local_training_backing.allocator); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } } -TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { +TaskArgumentAccessor + get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing, + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = - this->local_tensor_backing.construct_tensor_slots_backing( - invocation.binding); - ArgSlotsBacking arg_slots_backing = - this->local_args_backing.construct_arg_slots_backing(invocation.binding); + construct_tensor_slots_backing(local_tensor_backing, invocation.binding); + ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( + invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - this->allocator, tensor_slots_backing, arg_slots_backing); -} - -TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const { - TaskBinding binding; - // tensors - for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) { - tensor_guid_t tensor_to_bind = [&] { - switch (tensor_binding.second.role) { - case TensorRole::INPUT: - return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - case TensorRole::OUTPUT: - return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - case TensorRole::WEIGHT: - return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx); - default: - throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role)); - } - }(); - - if 
(tensor_binding.first.is_grad == IsGrad::NO) { - binding.bind(tensor_binding.first.slot_id, tensor_to_bind); - } else if (tensor_binding.first.is_grad == IsGrad::YES) { - binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); - } else { - throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); - } - } - - // args - for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) { - if (arg_binding.second.has()) { - ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get(), this->computation_graph, layer_guid); - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); - } else if (arg_binding.second.has()) { - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get()}); - } else { - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get()}); - } - } - - return TaskInvocation{op_task_invocation.task_id, binding}; + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index bfb3c0a32b..0a89dfd9d5 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -33,9 +33,8 @@ TaskSignature get_loss_bwd_signature() { return sig; } -TaskInvocation backward(LossAttrs const &attrs, - tensor_guid_t logit, - loss_tensor_t label) { +TaskInvocation + backward(LossAttrs const &attrs, tensor_guid_t logit, loss_tensor_t label) { TaskBinding b; b.bind(LOGIT, logit); b.bind(LABEL, label); diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc new file mode 100644 index 0000000000..da1efa6b85 --- /dev/null +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -0,0 +1,13 @@ +#include "local-execution/loss_tensor_source.h" + +namespace FlexFlow { + +size_t LossTensorSource::next_available_loss_tensor_id = 0; + +LossTensorSource::LossTensorSource() {} + +loss_tensor_t LossTensorSource::new_loss_tensor() { + return loss_tensor_t{LossTensorSource::next_available_loss_tensor_id++}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc index 05960ff5e2..af80aa2335 100644 --- a/lib/local-execution/src/lowered_tensor_source.cc +++ b/lib/local-execution/src/lowered_tensor_source.cc @@ -7,7 +7,8 @@ size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; LoweredTensorSource::LoweredTensorSource() {} lowered_tensor_t LoweredTensorSource::new_lowered_tensor() { - return lowered_tensor_t{LoweredTensorSource::next_available_lowered_tensor_id++}; + return lowered_tensor_t{ + LoweredTensorSource::next_available_lowered_tensor_id++}; } -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f57c5db73a..6691bd3a03 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -6,68 +6,66 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( - Allocator const &allocator, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, + LocalTrainingBacking const &local_training_backing, + tensor_guid_t const & logit_tensor, + TensorShape const &label_tensor_shape, LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t 
const &label_tensor, OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - training_backing(allocator, - computation_graph, - runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { - - // allocate each layer's tensors - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors( - node, this->optimizer_attrs); - } + : training_backing(local_training_backing), loss_attrs(loss_attrs), + optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), + label_tensor( + allocate_loss_tensor(this->training_backing.local_tensor_backing, + this->loss_tensor_source, + label_tensor_shape, + this->training_backing.allocator)) { + allocate_all_optimizer_tensors(this->training_backing.local_tensor_backing, + this->optimizer_tensor_source, + this->training_backing.computation_graph, + this->training_backing.allocator, + this->optimizer_attrs); } -void ModelTrainingInstance::execute_init() { - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_init(node); +void init(ModelTrainingInstance &model_training_instance) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { + execute_init(model_training_instance.training_backing, node); } } -PerLayerElapsedTime ModelTrainingInstance::execute_forward() { +PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { std::optional elapsed_time = - this->training_backing.execute_forward(node); + execute_forward(model_training_instance.training_backing, node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss( - this->loss_attrs, this->logit_tensor, this->label_tensor); +PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { + compute_loss(model_training_instance.training_backing, + model_training_instance.loss_attrs, + model_training_instance.logit_tensor, + model_training_instance.label_tensor); PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { std::optional elapsed_time = - this->training_backing.execute_backward(node); + execute_backward(model_training_instance.training_backing, node); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -void ModelTrainingInstance::execute_update() { +void update(ModelTrainingInstance & model_training_instance) { for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_update(node, this->optimizer_attrs); + topological_ordering(model_training_instance.training_backing.computation_graph)) { + execute_update(model_training_instance.training_backing, node, model_training_instance.optimizer_attrs); } - 
this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); + model_training_instance.optimizer_attrs = + get_optimizer_attrs_for_next_iter(model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/op_arg_spec.cc b/lib/local-execution/src/op_arg_spec.cc similarity index 100% rename from lib/local-execution/src/local-execution/op_arg_spec.cc rename to lib/local-execution/src/op_arg_spec.cc diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/local-execution/src/op_task_invocation.cc index b6771e6eb8..19c8894b05 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/local-execution/src/op_task_invocation.cc @@ -20,8 +20,7 @@ void OpTaskBinding::bind(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert( - {SlotGradId{slot, IsGrad::NO}, tensor_spec}); + this->tensor_bindings.insert({SlotGradId{slot, IsGrad::NO}, tensor_spec}); } void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { @@ -29,8 +28,7 @@ void OpTaskBinding::bind_grad(int slot, OpTensorSpec const &tensor_spec) { } void OpTaskBinding::bind_grad(slot_id_t slot, OpTensorSpec const &tensor_spec) { - this->tensor_bindings.insert( - {SlotGradId{slot, IsGrad::YES}, tensor_spec}); + this->tensor_bindings.insert({SlotGradId{slot, IsGrad::YES}, tensor_spec}); } void OpTaskBinding::insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec) { @@ -91,8 +89,8 @@ bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { auto tensor_bindings = inv.binding.get_tensor_bindings(); for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = SlotGradId{ - op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; + SlotGradId tensor_key = + SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { return false; diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/local-execution/src/op_task_signature.cc index 69b5463a0d..932b330453 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/local-execution/src/op_task_signature.cc @@ -12,12 +12,8 @@ void OpTaskSignature::add_input_slot(int name, SlotType slot_type) { } void OpTaskSignature::add_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::INPUT, - IsGrad::NO, - OpSlotOptions::NECESSARY}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::NECESSARY}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -27,12 +23,8 @@ void OpTaskSignature::add_optional_input_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_input_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::INPUT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::INPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -88,12 +80,8 @@ void OpTaskSignature::add_bwd_optional_output_slot(int name, void OpTaskSignature::add_bwd_optional_output_slot(slot_id_t name, SlotType slot_type) { - 
OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::OUTPUT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::OUTPUT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } @@ -117,12 +105,8 @@ void OpTaskSignature::add_optional_weight_slot(int name, SlotType slot_type) { void OpTaskSignature::add_optional_weight_slot(slot_id_t name, SlotType slot_type) { - OpTensorSlotSpec op_tensor_slot_spec = - OpTensorSlotSpec{name, - slot_type, - TensorRole::WEIGHT, - IsGrad::NO, - OpSlotOptions::OPTIONAL}; + OpTensorSlotSpec op_tensor_slot_spec = OpTensorSlotSpec{ + name, slot_type, TensorRole::WEIGHT, IsGrad::NO, OpSlotOptions::OPTIONAL}; this->op_tensor_slots.insert(op_tensor_slot_spec); } diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/local-execution/src/op_task_to_task_invocation.cc new file mode 100644 index 0000000000..eb6dffabc4 --- /dev/null +++ b/lib/local-execution/src/op_task_to_task_invocation.cc @@ -0,0 +1,108 @@ +#include "local-execution/op_task_to_task_invocation.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +TaskInvocation lower_to_task_invocation( + OpTaskInvocation const &op_task_invocation, + layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, + std::optional const &device_states) { + TaskBinding binding; + // tensors + std::vector input_tensors = + get_incoming_inputs(computation_graph, layer_guid); + std::vector output_tensors = + get_outgoing_tensors(computation_graph, layer_guid); + std::vector weight_tensors = + get_incoming_weights(computation_graph, layer_guid); + + for (auto const &tensor_binding : + op_task_invocation.binding.get_tensor_bindings()) { + tensor_guid_t tensor_to_bind = [&] { + OpTensorSpec tensor_binding_spec = tensor_binding.second; + switch (tensor_binding_spec.role) { + case TensorRole::INPUT: + return input_tensors.at(tensor_binding_spec.idx); + case TensorRole::OUTPUT: + return output_tensors.at(tensor_binding_spec.idx); + case TensorRole::WEIGHT: + return weight_tensors.at(tensor_binding_spec.idx); + default: + throw mk_runtime_error( + fmt::format("Invalid tensor role {}", tensor_binding_spec.role)); + } + }(); + + SlotGradId slot_grad_id = tensor_binding.first; + + if (slot_grad_id.is_grad == IsGrad::NO) { + binding.bind(slot_grad_id.slot_id, tensor_to_bind); + } else if (slot_grad_id.is_grad == IsGrad::YES) { + binding.bind_grad(slot_grad_id.slot_id, tensor_to_bind); + } else { + throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", + tensor_binding.first.is_grad)); + } + } + + // args + for (auto const &arg_binding : + op_task_invocation.binding.get_arg_bindings()) { + if (arg_binding.second.has()) { + ConcreteArgSpec concrete_arg = + lower_to_concrete_arg_spec(arg_binding.second.get(), + computation_graph, + layer_guid, + device_states); + binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); + } else if (arg_binding.second.has()) { + binding.insert_arg_spec( + arg_binding.first, + TaskArgSpec{arg_binding.second.get()}); + } else { + binding.insert_arg_spec( + arg_binding.first, + TaskArgSpec{arg_binding.second.get()}); + } + } + + return TaskInvocation{op_task_invocation.task_id, binding}; +} + +ConcreteArgSpec lower_to_concrete_arg_spec( + OpArgRefSpec const &op_arg_ref_spec, + ComputationGraph const &cg, + layer_guid_t const &op_guid, + 
std::optional const &device_states) { + if (op_arg_ref_spec.holds()) { + PerDeviceOpState device_state = + get_device_state_from_device_specific(device_states.value(), 0); + return ConcreteArgSpec::create(device_state); + } else if (op_arg_ref_spec.holds()) { + ParallelTensorShapeRefType index_op_arg_ref = + op_arg_ref_spec.get_ref_type().get(); + tensor_guid_t input_tensor = + get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); + TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); + ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + return ConcreteArgSpec::create(shape); + } else { + throw mk_runtime_error("Unhandled op arg ref type"); + } +} + +ConcreteArgSpec + lower_to_concrete_arg_spec(RuntimeArgRefSpec const &runtime_arg_ref_spec, + RuntimeArgConfig const &runtime_arg_config) { + if (runtime_arg_ref_spec.holds>()) { + return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); + } else if (runtime_arg_ref_spec.holds()) { + return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); + } else { + throw mk_runtime_error("Unhandled runtime arg ref type"); + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc index 8adb8ec07b..c241c7f4bd 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/local-execution/src/optimizer_tensor_source.cc @@ -7,7 +7,8 @@ size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0; OptimizerTensorSource::OptimizerTensorSource() {} optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { - return optimizer_tensor_t{OptimizerTensorSource::next_available_optimizer_tensor_id++}; + return optimizer_tensor_t{ + OptimizerTensorSource::next_available_optimizer_tensor_id++}; } -} +} // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 6fc8449f0b..f0aac85ea1 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,49 +1,45 @@ #include "local-execution/task_binding.h" +#include "pcg/tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" #include "utils/hash/unordered_map.h" -#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { -void TaskBinding::bind(int name, - tensor_guid_t const &binding) { +void TaskBinding::bind(int name, tensor_guid_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - tensor_guid_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind_grad(int name, - tensor_guid_t const &binding) { +void TaskBinding::bind_grad(int name, tensor_guid_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind_grad(slot_id_t name, - tensor_guid_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}}); +void TaskBinding::bind_grad(slot_id_t name, tensor_guid_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, - optimizer_tensor_t const &binding) { +void TaskBinding::bind(int name, 
optimizer_tensor_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - optimizer_tensor_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, optimizer_tensor_t const &binding) { + this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, + TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, - loss_tensor_t const &binding) { +void TaskBinding::bind(int name, loss_tensor_t const &binding) { this->bind(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, - loss_tensor_t const &binding) { - this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); +void TaskBinding::bind(slot_id_t name, loss_tensor_t const &binding) { + this->tensor_bindings.insert( + {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); } void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) { @@ -91,12 +87,12 @@ std::ostream &operator<<(std::ostream &s, TaskBinding const &x) { namespace std { -size_t hash<::FlexFlow::TaskBinding>::operator() ( - ::FlexFlow::TaskBinding const &s) const { - size_t result = 0; - hash_combine(result, s.get_tensor_bindings()); - hash_combine(result, s.get_arg_bindings()); - return result; - } +size_t hash<::FlexFlow::TaskBinding>::operator()( + ::FlexFlow::TaskBinding const &s) const { + size_t result = 0; + hash_combine(result, s.get_tensor_bindings()); + hash_combine(result, s.get_arg_bindings()); + return result; +} } // namespace std diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index be1cf73e11..9b7b55633c 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,5 +1,6 @@ #include "local-execution/task_registry.h" #include "local-execution/task_signature_impl.h" +#include "pcg/computation_graph.h" namespace FlexFlow { @@ -43,8 +44,8 @@ void register_tasks_for_layer(TaskRegistry &task_registry, } bool registry_contains_task_for_layer(TaskRegistry const &task_registry, - layer_guid_t const &op, - OpTaskType const &op_task_type) { + layer_guid_t const &op, + OpTaskType const &op_task_type) { std::unordered_map> task_ids; switch (op_task_type) { case OpTaskType::INIT: @@ -63,4 +64,12 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, return task_ids.at(op).has_value(); } +void register_all_computation_graph_tasks(TaskRegistry ®istry, + ComputationGraph const &cg) { + for (layer_guid_t const &node : topological_ordering(cg)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + register_tasks_for_layer(registry, node, attrs); + } +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 88dfa34783..5f7c1ddb91 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -1,24 +1,24 @@ #include "kernels/attention_kernels.h" #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_slots_backing.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/tensor_lowering.h" #include "op-attrs/ops/attention.h" #include 
"op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" -#include "utils/containers/get_only.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" #include "test/utils/doctest/fmt/variant.h" #include "test/utils/doctest/fmt/vector.h" #include "test_utils.h" +#include "utils/containers/get_only.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalSlotsBacking -- Attention Op") { + TEST_CASE("LocalTensorBacking -- Attention Op") { // allocate input memory Allocator allocator = create_local_cpu_memory_allocator(); int embed_dim = 32; @@ -69,10 +69,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_layer_by_name(cg_builder.computation_graph, layer_name); LayerTensorBackingMap layer_tensor_backing_map = { - {LayerTensorKey{layer_guid, lower(query_guid)}, query}, - {LayerTensorKey{layer_guid, lower(key_guid)}, key}, - {LayerTensorKey{layer_guid, lower(value_guid)}, value}, - //{LayerTensorKey{layer_guid, lower(output_guid), output}} + {LayerTensorKey{layer_guid, lower(query_guid)}, query}, + {LayerTensorKey{layer_guid, lower(key_guid)}, key}, + {LayerTensorKey{layer_guid, lower(value_guid)}, value}, + //{LayerTensorKey{layer_guid, lower(output_guid), output}} }; // runtime arg config @@ -84,13 +84,13 @@ TEST_SUITE(FF_TEST_SUITE) { EnableProfiling::NO, settings}; - LocalSlotsBacking local_slots_backing = {layer_tensor_backing_map, - TensorBackingMap{}, - runtime_arg_config}; + LocalTensorBacking local_tensor_backing = { + layer_tensor_backing_map, TensorBackingMap{}, runtime_arg_config}; - SUBCASE("LocalSlotsBacking::allocate_tensors_by_role") { + SUBCASE("LocalTensorBacking::allocate_tensors_by_role") { auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, layer_guid_t l, + [&](tensor_guid_t t, + layer_guid_t l, LayerTensorBackingMap m) -> std::pair { GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); return get_shape_and_datatype(accessor); @@ -99,7 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Input (QKV) and gradient tensors allocation") { // allocate all tensors from input nodes - local_slots_backing.allocate_tensors_by_role( + local_tensor_backing.allocate_tensors_by_role( TensorRole::INPUT, layer_guid, cg_builder.computation_graph, @@ -108,7 +108,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Query grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + query_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{query_shape}, dtype}; CHECK(result == correct); @@ -116,7 +118,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Key grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + key_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{key_shape}, dtype}; CHECK(result == correct); @@ -124,14 +128,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Value grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + value_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = {ArrayShape{value_shape}, dtype}; CHECK(result == correct); } } SUBCASE("Output and gradient tensors allocation") { - local_slots_backing.allocate_tensors_by_role( + 
local_tensor_backing.allocate_tensors_by_role( TensorRole::OUTPUT, layer_guid, cg_builder.computation_graph, @@ -139,7 +145,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_slots_backing.tensor_mapping); + output_guid, layer_guid, local_tensor_backing.tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -150,7 +156,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Output grad") { std::pair result = get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_slots_backing.gradient_tensor_mapping); + output_guid, + layer_guid, + local_tensor_backing.gradient_tensor_mapping); std::pair correct = { ArrayShape{ get_tensor_attrs(cg_builder.computation_graph, output_guid) @@ -161,31 +169,36 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Tensor slots") { - local_slots_backing.allocate_layer_tensors( + local_tensor_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - transform(get_incoming_inputs(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_incoming_input_tensors = + transform( + get_incoming_inputs(cg_builder.computation_graph, layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_incoming_input_tensors == - local_slots_backing.input_tensor_slots.at(layer_guid)); + local_tensor_backing.input_tensor_slots.at(layer_guid)); } SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - transform(get_incoming_weights(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_incoming_weight_tensors = + transform(get_incoming_weights(cg_builder.computation_graph, + layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_incoming_weight_tensors == - local_slots_backing.weight_tensor_slots.at(layer_guid)); + local_tensor_backing.weight_tensor_slots.at(layer_guid)); } SUBCASE("Output tensor slots") { - std::vector correct_output_tensors = - transform(get_outgoing_tensors(cg_builder.computation_graph, layer_guid), [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); + std::vector correct_output_tensors = transform( + get_outgoing_tensors(cg_builder.computation_graph, layer_guid), + [&](tensor_guid_t const &tensor_guid) { + return lower(tensor_guid); + }); CHECK(correct_output_tensors == - local_slots_backing.output_tensor_slots.at(layer_guid)); + local_tensor_backing.output_tensor_slots.at(layer_guid)); } } } @@ -224,14 +237,14 @@ TEST_SUITE(FF_TEST_SUITE) { return b; }(); - local_slots_backing.allocate_layer_tensors( + local_tensor_backing.allocate_layer_tensors( layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") { + SUBCASE("LocalTensorBacking::construct_tensor_slots_backing") { TensorSlotsBackingWithoutAddresses result = get_slots_backing_without_tensor_allocation_addresses( - local_slots_backing.construct_tensor_slots_backing(binding, - layer_guid)); + local_tensor_backing.construct_tensor_slots_backing( + binding, layer_guid)); TensorSlotsBackingWithoutAddresses correct = [&] { TensorShape weights_shape = throw_if_unexpected( 
get_weights_shape(attrs, query_shape, key_shape, value_shape)); @@ -244,20 +257,25 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_attrs.shape); return get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking{ - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, query}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, + query}, {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, - {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, value}, - {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, weights}, - {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, output}, - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, query}}); + {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, + value}, + {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, + weights}, + {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, + output}, + {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, + query}}); }(); CHECK(result == correct); } - SUBCASE("LocalSlotsBacking::construct_arg_slots_backing") { + SUBCASE("LocalTensorBacking::construct_arg_slots_backing") { ArgSlotsBacking result = - local_slots_backing.construct_arg_slots_backing(binding, - layer_guid); + local_tensor_backing.construct_arg_slots_backing(binding, + layer_guid); ArgSlotsBacking correct = [&] { ParallelTensorShape query_parallel_tensor_shape = @@ -277,10 +295,10 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("LocalSlotsBacking::resolve_runtime_arg_ref_spec") { + SUBCASE("LocalTensorBacking::resolve_runtime_arg_ref_spec") { RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); ConcreteArgSpec arg_spec = - local_slots_backing.resolve_runtime_arg_ref_spec(ref_spec); + local_tensor_backing.resolve_runtime_arg_ref_spec(ref_spec); PerDeviceFFHandle result_handle = arg_spec.get(); CHECK(result_handle == handle); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index bddda7acd1..979e4360d7 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -39,7 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorSlotsBacking tensor_slots_backing = { {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, variadic_tensors}, + {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, + variadic_tensors}, {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, variadic_tensors_grad}, }; @@ -50,45 +51,46 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input)}; - GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); + GenericTensorAccessor result = acc.get_tensor( + slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; - GenericTensorAccessor result = - acc.get_tensor(slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); + GenericTensorAccessor result = acc.get_tensor( 
+          slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input_grad};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD);
      CHECK(correct == result);
    }
    SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") {
      GenericTensorAccessor correct = GenericTensorAccessor{input_grad};
-      GenericTensorAccessor result =
-          acc.get_tensor(slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT);
+      GenericTensorAccessor result = acc.get_tensor(
+          slot_id_t{INPUT}, Permissions::RW, TensorType::GRADIENT);
      CHECK(correct == result);
    }
  }
  SUBCASE("get_variadic_tensor") {
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{std::vector{
              read_only_accessor_from_write_accessor(variadic_tensors.at(0)),
@@ -98,7 +100,8 @@ TEST_SUITE(FF_TEST_SUITE) {
          slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{std::vector{
              read_only_accessor_from_write_accessor(
@@ -109,28 +112,32 @@ TEST_SUITE(FF_TEST_SUITE) {
          slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors_grad};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RW, "
+            "TensorType::FORWARD)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
          slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD);
      CHECK(result == correct);
    }
-    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") {
+    SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RW, "
+            "TensorType::GRADIENT)") {
      VariadicGenericTensorAccessor correct =
          VariadicGenericTensorAccessor{variadic_tensors_grad};
      VariadicGenericTensorAccessor result = acc.get_variadic_tensor(
diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc
index 5793d02f31..210cd1af83 100644
--- a/lib/local-execution/test/src/test_loss_e2e.cc
+++ b/lib/local-execution/test/src/test_loss_e2e.cc
@@ -1,9 +1,9 @@
 #include "doctest/doctest.h"
-#include "local-execution/tensor_reduction.h"
 #include "kernels/local_cuda_allocator.h"
 #include "kernels/managed_ff_stream.h"
 #include "kernels/managed_per_device_ff_handle.h"
 #include "local-execution/local_training_backing.h"
+#include "local-execution/tensor_lowering.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
@@ -36,7 +36,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
    std::string layer_name = "scalar multiply";
    tensor_guid_t logit_tensor =
        cg_builder.scalar_multiply(input_tensor, scalar, layer_name);
-    layer_guid_t layer_guid = get_layer_by_name(cg_builder.computation_graph, layer_name);
+    layer_guid_t layer_guid =
+        get_layer_by_name(cg_builder.computation_graph, layer_name);
    // allocate memory
    Allocator allocator = create_local_cuda_memory_allocator();
@@ -52,37 +53,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
    SUBCASE("SparseCategoricalCrossEntropyLossAttrs") {
      TensorShape label_shape = TensorShape{
          TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT};
-      reduced_tensor_t label_tensor = reduced_tensor_t{-1};
+      lowered_tensor_t label_tensor = lowered_tensor_t{-1};
      GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(label_shape);
-      local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing});
+      local_backing.local_tensor_backing.non_graph_tensor_mapping.insert(
+          {label_tensor, label_backing});
      LossAttrs loss_attrs = LossAttrs{
          SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}};
      local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
    }
    SUBCASE("NonconfigurableLossAttrs") {
-      reduced_tensor_t label_tensor = reduced_tensor_t{-1};
+      lowered_tensor_t label_tensor = lowered_tensor_t{-1};
      GenericTensorAccessorW label_backing =
          allocator.allocate_tensor(input_shape);
-      local_backing.local_slots_backing.non_graph_tensor_mapping.insert({label_tensor, label_backing});
+      local_backing.local_tensor_backing.non_graph_tensor_mapping.insert(
+          {label_tensor, label_backing});
      SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") {
        LossAttrs loss_attrs = LossAttrs{
            NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
-        local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
+        local_backing.compute_loss(
+            loss_attrs, lower(logit_tensor), label_tensor);
      }
      SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") {
        LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{
            LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}};
-        local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor);
+        local_backing.compute_loss(
loss_attrs, lower(logit_tensor), label_tensor); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); + local_backing.compute_loss( + loss_attrs, lower(logit_tensor), label_tensor); } } } diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index 2e5e386a95..d16c5e5b0b 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_reduction.h" +#include "local-execution/tensor_lowering.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/operator_attrs.h b/lib/op-attrs/include/op-attrs/operator_attrs.h index 483e735196..d94f7af4fb 100644 --- a/lib/op-attrs/include/op-attrs/operator_attrs.h +++ b/lib/op-attrs/include/op-attrs/operator_attrs.h @@ -9,6 +9,7 @@ #include "op-attrs/ops/combine.h" #include "op-attrs/ops/concat.h" #include "op-attrs/ops/conv_2d.h" +#include "op-attrs/ops/core.h" #include "op-attrs/ops/dropout.h" #include "op-attrs/ops/element_binary.h" #include "op-attrs/ops/element_unary.h" @@ -30,7 +31,6 @@ #include "op-attrs/ops/split.h" #include "op-attrs/ops/topk.h" #include "op-attrs/ops/transpose.h" -#include "op-attrs/ops/core.h" #include "op-attrs/pcg_operator_attrs.dtg.h" #include "utils/record_formatter.h" #include "utils/variant.h" diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index f70d9f7404..e3a8cc662c 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -37,6 +37,8 @@ std::vector get_incoming_inputs(ComputationGraph const &, std::vector get_incoming_weights(ComputationGraph const &, layer_guid_t const &); +std::unordered_set get_all_tensors(ComputationGraph const &); + std::unordered_set get_subgraph_incoming_edges(ComputationGraph const &, std::unordered_set const &); diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index 1d74694c29..f18722d1bb 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -7,6 +7,7 @@ namespace FlexFlow { OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); +int get_num_optimizer_tensors(OptimizerAttrs const &); } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index a69e54fd93..d358dc5031 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -121,6 +121,11 @@ std::vector get_incoming_weights(ComputationGraph const &cg, return get_incoming_tensors_with_role(cg, l, IncomingTensorRole::WEIGHT); } +std::unordered_set get_all_tensors(ComputationGraph const &cg) { + return transform(get_all_dataflow_outputs(cg.raw_graph), + [](DataflowOutput const &t) { return tensor_guid_t(t); }); +} + std::unordered_set get_subgraph_incoming_edges( ComputationGraph const &cg, std::unordered_set const &subgraph_nodes) { diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index ce2d3d0db7..7a37091428 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ 
b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -1,4 +1,5 @@ #include "pcg/optimizer_attrs.h" +#include "utils/overload.h" namespace FlexFlow { @@ -22,4 +23,16 @@ OptimizerAttrs } } +int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &o) { + if (o.momentum > 0.0f) { + return 1; + } else { + return 0; + } + }, + [&](AdamOptimizerAttrs const &) { return 2; }}); +} + } // namespace FlexFlow From 277f8c268632dfcc5622d96f55b65751d063d736 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 31 Jan 2025 19:36:45 -0800 Subject: [PATCH 33/91] Update task binding interface and cost estimator --- .../local-execution/local_training_backing.h | 3 +- .../include/local-execution/loss_functions.h | 8 +- .../local-execution/model_training_instance.h | 4 +- .../op_task_to_task_invocation.h | 11 +- .../include/local-execution/optimizer.h | 3 + .../local-execution/task_argument_accessor.h | 45 ++++---- .../include/local-execution/task_binding.h | 12 +- .../include/local-execution/task_registry.h | 1 + .../src/local_cost_estimator.cc | 103 ++++++++++++------ .../src/local_training_backing.cc | 59 ++++++---- lib/local-execution/src/loss_functions.cc | 18 +-- .../src/model_training_instance.cc | 18 +-- .../src/op_task_to_task_invocation.cc | 5 +- lib/local-execution/src/optimizer.cc | 47 +++++--- lib/local-execution/src/task_binding.cc | 19 ++-- .../include/pcg/computation_graph_builder.h | 6 - .../parallel_computation_graph_builder.h | 4 - lib/pcg/src/pcg/computation_graph_builder.cc | 14 +-- .../parallel_computation_graph_builder.cc | 2 +- 19 files changed, 220 insertions(+), 162 deletions(-) diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index b712be9a93..ef5e7ec41e 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -25,7 +25,6 @@ struct LocalTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; -private: GradientTensorSource gradient_tensor_source; }; @@ -42,7 +41,7 @@ std::optional execute_forward(LocalTrainingBacking &, layer_guid_t const &); std::optional execute_backward(LocalTrainingBacking &, layer_guid_t const &); -void compute_loss(LocalTrainingBacking const &, +void compute_loss(LocalTrainingBacking &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index f56f2b05b1..b2a6d610c3 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,19 +16,21 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/task_signature.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); -TaskInvocation - backward(LossAttrs const &, tensor_guid_t logit, loss_tensor_t label); +TaskInvocation backward(LossAttrs 
const &, + tensor_guid_t logit, + gradient_tensor_t logit_grad, + loss_tensor_t label); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 81aacf2a53..bf0fc1a3c0 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -13,8 +13,8 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(LocalTrainingBacking const &, - tensor_guid_t const & logit_tensor, - TensorShape const & label_tensor_shape, + tensor_guid_t const &logit_tensor, + TensorShape const &label_tensor_shape, LossAttrs const &, OptimizerAttrs const &); diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h index 44e10d4b51..02b3c938b0 100644 --- a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h +++ b/lib/local-execution/include/local-execution/op_task_to_task_invocation.h @@ -10,11 +10,12 @@ namespace FlexFlow { -TaskInvocation - lower_to_task_invocation(OpTaskInvocation const &, - layer_guid_t const &, - ComputationGraph const &, - std::optional const &); +TaskInvocation lower_to_task_invocation( + OpTaskInvocation const &, + layer_guid_t const &, + ComputationGraph const &, + std::unordered_map const &, + std::optional const &); ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, RuntimeArgConfig const &); diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f0dd610a1f..3a092e34c6 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -14,18 +14,21 @@ TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &adam_v, optimizer_tensor_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 16a63a789b..0cbeaf04c8 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -50,7 +50,7 @@ struct TaskArgumentAccessor { template privilege_mode_to_accessor get_optimizer_tensor(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); + return this->get_optimizer_tensor(slot_id_t{slot}); } template @@ -59,17 +59,16 @@ struct TaskArgumentAccessor { this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - // template - // privilege_mode_to_accessor get_non_graph_tensor(int slot) const { - // return this->get_tensor_grad(slot_id_t{slot}); 
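As a point of reference for the accessor split above: a task body now picks the getter matching the TensorType its slot was bound with. A minimal consumer sketch (the slot names and the SGD framing are illustrative, not part of this patch):

    // Sketch only: assumes WEIGHT / WEIGHT_GRAD / SGD_V were bound via bind,
    // bind_grad, and bind_optimizer respectively (see optimizer.cc later in
    // this patch).
    static void example_sgd_task_body(TaskArgumentAccessor const &acc) {
      auto weight = acc.get_tensor<Permissions::RW>(WEIGHT);
      auto weight_grad = acc.get_tensor_grad<Permissions::RO>(WEIGHT_GRAD);
      auto sgd_v = acc.get_optimizer_tensor<Permissions::RW>(SGD_V);
      // each getter std::get's the variant member for its TensorType, so a
      // mismatched bind/get pair fails loudly instead of aliasing a tensor
    }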
- // } + template + privilege_mode_to_accessor get_loss_tensor(int slot) const { + return this->get_loss_tensor(slot_id_t{slot}); + } - // template - // privilege_mode_to_accessor get_non_graph_tensor(slot_id_t slot) const - // { - // return std::get>( - // this->ptr->get_tensor(slot, PRIV, TensorType::NON_GRAPH)); - // } + template + privilege_mode_to_accessor get_loss_tensor(slot_id_t slot) const { + return std::get>( + this->ptr->get_tensor(slot, PRIV, TensorType::LOSS)); + } // variadic tensors template @@ -101,7 +100,7 @@ struct TaskArgumentAccessor { template std::vector> get_variadic_optimizer_tensor(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); + return this->get_variadic_optimizer_tensor(slot_id_t{slot}); } template @@ -111,18 +110,18 @@ struct TaskArgumentAccessor { this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); } - // template - // std::vector> - // get_variadic_non_graph_tensor(int slot) const { - // return this->get_variadic_tensor_grad(slot_id_t{slot}); - // } + template + std::vector> + get_variadic_loss_tensor(int slot) const { + return this->get_variadic_loss_tensor(slot_id_t{slot}); + } - // template - // std::vector> - // get_variadic_non_graph_tensor(slot_id_t slot) const { - // return std::get>>( - // this->ptr->get_variadic_tensor(slot, PRIV, TensorType::NON_GRAPH)); - // } + template + std::vector> + get_variadic_loss_tensor(slot_id_t slot) const { + return std::get>>( + this->ptr->get_variadic_tensor(slot, PRIV, TensorType::LOSS)); + } Allocator get_allocator() const { return this->ptr->get_allocator(); diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/local-execution/include/local-execution/task_binding.h index 21fc813a6b..aba0c01a65 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/local-execution/include/local-execution/task_binding.h @@ -19,14 +19,14 @@ struct TaskBinding { void bind(int, tensor_guid_t const &); void bind(slot_id_t, tensor_guid_t const &); - void bind_grad(int, tensor_guid_t const &); - void bind_grad(slot_id_t, tensor_guid_t const &); + void bind_grad(int, gradient_tensor_t const &); + void bind_grad(slot_id_t, gradient_tensor_t const &); - void bind(int, optimizer_tensor_t const &); - void bind(slot_id_t, optimizer_tensor_t const &); + void bind_optimizer(int, optimizer_tensor_t const &); + void bind_optimizer(slot_id_t, optimizer_tensor_t const &); - void bind(int, loss_tensor_t const &); - void bind(slot_id_t, loss_tensor_t const &); + void bind_loss(int, loss_tensor_t const &); + void bind_loss(slot_id_t, loss_tensor_t const &); template void bind_arg(int name, T const &t) { diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 1669822c83..cb717ca2af 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -5,6 +5,7 @@ #include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" +#include "pcg/computation_graph.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index b959f31a8b..24175a5ee8 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -8,6 +8,7 @@ #include "pcg/computation_graph/layer_added_result.dtg.h" #include 
"pcg/computation_graph_builder.h" #include "pcg/parallel_tensor_attrs.h" +#include "utils/containers/concat_vectors.h" #include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" @@ -17,6 +18,53 @@ namespace FlexFlow { LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} +static ComputationGraph const & + create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { + ComputationGraph computation_graph = make_empty_computation_graph(); + + // create layer for inputs + auto get_vector_piece_attrs_from_parallel_tensor_shape = + [](std::vector const ¶llel_shapes) { + return transform(parallel_shapes, [](ParallelTensorShape const &p) { + return TensorAttrs{ + get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES}; + }); + }; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + get_vector_piece_attrs_from_parallel_tensor_shape(inputs)); + + // create layer for weights + auto get_vector_piece_attrs_from_parallel_tensor_attrs = + [](std::vector const ¶llel_attrs) { + return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { + return get_piece_attrs(p); + }); + }; + + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"}, + {}, + get_vector_piece_attrs_from_parallel_tensor_attrs(weights)); + + // create operator layer + LayerAddedResult operator_layer = add_layer( + computation_graph, + LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, + concat_vectors(inputs_layer.outputs, weights_layer.outputs), + get_vector_piece_attrs_from_parallel_tensor_attrs(outputs)); + + return computation_graph; +} + CostDetails LocalCostEstimator::estimate_cost( PCGOperatorAttrs const &op, std::vector const &inputs, @@ -29,47 +77,34 @@ CostDetails LocalCostEstimator::estimate_cost( return CostDetails{0, 0}; } - LayerAttrs layer_attrs = - LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), std::nullopt}; + // construct computation graph + ComputationGraph computation_graph = + create_computation_graph_for_local_cost_estimation( + op, inputs, weights, outputs); - // allocate memory for inputs + // allocate memory std::shared_ptr tracked_allocator_ptr = std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - std::vector input_tensor_ids; - - ComputationGraphBuilder cg_builder; - for (ParallelTensorShape const &input : inputs) { - TensorShape tensor_shape = get_piece_shape(input); - tensor_guid_t tensor_id = - cg_builder.create_input(tensor_shape, CreateGrad::YES); - input_tensor_ids.push_back(tensor_id); - } - auto get_vector_piece_attrs = - [](std::vector const ¶llel_attrs) { - return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { - return get_piece_attrs(p); - }); - }; + LocalTrainingBacking local_backing( + allocator, + computation_graph, + LocalTensorBacking{}, + LocalArgsBacking{this->runtime_arg_config}); - // add operator to graph - LayerAddedResult layer_added_result = - cg_builder.add_layer_and_get_layer_added_result( - layer_attrs, - input_tensor_ids, - transform(get_vector_piece_attrs(weights), - [&](TensorAttrs const &a) { - return cg_builder.create_weight(a); - }), - get_vector_piece_attrs(outputs)); + 
allocate_all_computation_graph_tensors(local_backing.local_tensor_backing, + local_backing.gradient_tensor_source, + local_backing.computation_graph, + local_backing.allocator); + + // execute layer + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + execute_init(local_backing, operator_layer_guid); + float fwd = execute_forward(local_backing, operator_layer_guid).value(); + float bwd = execute_backward(local_backing, operator_layer_guid).value(); - LocalTrainingBacking local_backing( - allocator, cg_builder.computation_graph, this->runtime_arg_config); - local_backing.register_and_allocate_layer(layer_added_result.layer); - local_backing.execute_init(layer_added_result.layer); - float fwd = local_backing.execute_forward(layer_added_result.layer).value(); - float bwd = local_backing.execute_backward(layer_added_result.layer).value(); float total_execution_time = fwd + bwd; return CostDetails{total_execution_time, diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 4893d9be88..144596820a 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -60,15 +60,17 @@ void execute_init(LocalTrainingBacking &local_training_backing, get_layer_attrs(local_training_backing.computation_graph, operator_node) .attrs; - TaskInvocation invocation = - lower_to_task_invocation(init(attrs), - operator_node, - local_training_backing.computation_graph, - std::nullopt); + TaskInvocation invocation = lower_to_task_invocation( + init(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + std::nullopt); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - invocation); + invocation, + local_training_backing.allocator); DeviceSpecificDeviceStates device_state = call_init_task_impl( local_training_backing.task_registry, invocation.task_id, accessor); add_per_device_op_state( @@ -89,11 +91,12 @@ std::optional std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = - lower_to_task_invocation(forward(attrs), - operator_node, - local_training_backing.computation_graph, - device_state); + TaskInvocation invocation = lower_to_task_invocation( + forward(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + device_state); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, @@ -106,18 +109,23 @@ std::optional } } -void compute_loss(LocalTrainingBacking const &local_training_backing, +void compute_loss(LocalTrainingBacking &local_training_backing, LossAttrs const &loss_attrs, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor) { - TaskInvocation loss_invocation = - backward(loss_attrs, logit_tensor, label_tensor); + TaskInvocation loss_invocation = backward( + loss_attrs, + logit_tensor, + local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( + logit_tensor), + label_tensor); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = 
get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - loss_invocation); + loss_invocation, + local_training_backing.allocator); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); loss_impl_fn.get().function_ptr(loss_accessor); } @@ -135,11 +143,12 @@ std::optional std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = - lower_to_task_invocation(backward(attrs), - operator_node, - local_training_backing.computation_graph, - device_state); + TaskInvocation invocation = lower_to_task_invocation( + backward(attrs), + operator_node, + local_training_backing.computation_graph, + local_training_backing.local_tensor_backing.tensor_gradient_mapping, + device_state); TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, @@ -161,13 +170,19 @@ void execute_update(LocalTrainingBacking &local_training_backing, // get tensors tensor_guid_t weight_tensor = get_only( get_outgoing_tensors(local_training_backing.computation_graph, node)); + + gradient_tensor_t weight_grad_tensor = + local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( + weight_tensor); std::vector optimizer_buffer_tensors = local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( weight_tensor); // get invocation - TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + TaskInvocation invocation = get_update_invocation(optimizer_attrs, + weight_tensor, + weight_grad_tensor, + optimizer_buffer_tensors); // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 0a89dfd9d5..93a792c466 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -20,25 +20,27 @@ namespace FlexFlow { -enum Slots { LOGIT, LABEL, ATTRS, PROFILING }; +enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, LOGIT, TensorType::FORWARD); add_slot(sig, LABEL, TensorType::LOSS); - add_slot(sig, LOGIT, TensorType::GRADIENT); + add_slot(sig, LOGIT_GRAD, TensorType::GRADIENT); add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); return sig; } -TaskInvocation - backward(LossAttrs const &attrs, tensor_guid_t logit, loss_tensor_t label) { +TaskInvocation backward(LossAttrs const &attrs, + tensor_guid_t logit, + gradient_tensor_t logit_grad, + loss_tensor_t label) { TaskBinding b; b.bind(LOGIT, logit); - b.bind(LABEL, label); - b.bind_grad(LOGIT, logit); + b.bind_loss(LABEL, label); + b.bind_grad(LOGIT_GRAD, logit_grad); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -49,9 +51,9 @@ TaskInvocation static void backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto profiling = acc.get_argument(PROFILING); - auto logit_grad = acc.get_tensor_grad(LOGIT); + auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); - auto label = acc.get_tensor(LABEL); + auto label = acc.get_loss_tensor(LABEL); int batch_size = logit.shape.at(legion_dim_t{1}); // assuming logit shape is [batch dim, num classes] diff --git 
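One consequence of the LOGIT/LOGIT_GRAD split above: a slot id no longer does double duty for forward and gradient bindings. A rough sketch of the resulting binding shape, mirroring backward() above with illustrative values:

    TaskBinding b;
    b.bind(LOGIT, logit);                 // keyed {LOGIT, TensorType::FORWARD}
    b.bind_grad(LOGIT_GRAD, logit_grad);  // keyed {LOGIT_GRAD, TensorType::GRADIENT}
    b.bind_loss(LABEL, label);            // keyed {LABEL, TensorType::LOSS}
    // one binding per add_slot entry declared in get_loss_bwd_signature()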
a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 6691bd3a03..98b8851212 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -7,7 +7,7 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( LocalTrainingBacking const &local_training_backing, - tensor_guid_t const & logit_tensor, + tensor_guid_t const &logit_tensor, TensorShape const &label_tensor_shape, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) @@ -45,7 +45,7 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, + model_training_instance.loss_attrs, model_training_instance.logit_tensor, model_training_instance.label_tensor); @@ -59,13 +59,15 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -void update(ModelTrainingInstance & model_training_instance) { - for (layer_guid_t const &node : - topological_ordering(model_training_instance.training_backing.computation_graph)) { - execute_update(model_training_instance.training_backing, node, model_training_instance.optimizer_attrs); +void update(ModelTrainingInstance &model_training_instance) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { + execute_update(model_training_instance.training_backing, + node, + model_training_instance.optimizer_attrs); } - model_training_instance.optimizer_attrs = - get_optimizer_attrs_for_next_iter(model_training_instance.optimizer_attrs); + model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( + model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/local-execution/src/op_task_to_task_invocation.cc index eb6dffabc4..0e04a2adec 100644 --- a/lib/local-execution/src/op_task_to_task_invocation.cc +++ b/lib/local-execution/src/op_task_to_task_invocation.cc @@ -8,6 +8,8 @@ TaskInvocation lower_to_task_invocation( OpTaskInvocation const &op_task_invocation, layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, + std::unordered_map const + &tensor_gradient_mapping, std::optional const &device_states) { TaskBinding binding; // tensors @@ -40,7 +42,8 @@ TaskInvocation lower_to_task_invocation( if (slot_grad_id.is_grad == IsGrad::NO) { binding.bind(slot_grad_id.slot_id, tensor_to_bind); } else if (slot_grad_id.is_grad == IsGrad::YES) { - binding.bind_grad(slot_grad_id.slot_id, tensor_to_bind); + binding.bind_grad(slot_grad_id.slot_id, + tensor_gradient_mapping.at(tensor_to_bind)); } else { throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad)); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 94584dfc95..0c64147bd8 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -5,12 +5,21 @@ namespace FlexFlow { -enum Slots { ATTRS, WEIGHT, SGD_V, PROFILING, ADAM_M, ADAM_V, HANDLE }; +enum Slots { + ATTRS, + WEIGHT, + WEIGHT_GRAD, + SGD_V, + PROFILING, + ADAM_M, + ADAM_V, + HANDLE +}; TaskSignature get_sgd_update_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, WEIGHT, TensorType::FORWARD); - 
add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, WEIGHT_GRAD, TensorType::GRADIENT); add_slot(sig, SGD_V, TensorType::OPTIMIZER); add_arg_slot(sig, ATTRS); @@ -23,13 +32,14 @@ TaskSignature get_sgd_update_signature() { TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &sgd_v) { TaskBinding b; b.bind(WEIGHT, weight); - b.bind_grad(WEIGHT, weight); + b.bind_grad(WEIGHT_GRAD, weight_grad); if (attrs.momentum > 0.0f) { - b.bind(SGD_V, sgd_v); + b.bind_optimizer(SGD_V, sgd_v); } b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -44,7 +54,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD); auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); @@ -57,7 +67,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { float *sgd_v_ptr; if (attrs.momentum > 0.0f) { - auto sgd_v = acc.get_tensor(SGD_V); + auto sgd_v = acc.get_optimizer_tensor(SGD_V); assert(sgd_v.shape == weight.shape); sgd_v_ptr = sgd_v.get_float_ptr(); } @@ -100,7 +110,7 @@ TaskImplFunction get_sgd_update_task_impl() { TaskSignature get_adam_update_signature() { TaskSignature sig = make_empty_task_signature(); add_slot(sig, WEIGHT, TensorType::FORWARD); - add_slot(sig, WEIGHT, TensorType::GRADIENT); + add_slot(sig, WEIGHT_GRAD, TensorType::GRADIENT); add_slot(sig, ADAM_V, TensorType::OPTIMIZER); add_slot(sig, ADAM_M, TensorType::OPTIMIZER); @@ -114,13 +124,14 @@ TaskSignature get_adam_update_signature() { TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, optimizer_tensor_t const &adam_v, optimizer_tensor_t const &adam_m) { TaskBinding b; b.bind(WEIGHT, weight); - b.bind_grad(WEIGHT, weight); - b.bind(ADAM_M, adam_m); - b.bind(ADAM_V, adam_v); + b.bind_grad(WEIGHT_GRAD, weight_grad); + b.bind_optimizer(ADAM_M, adam_m); + b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); @@ -134,10 +145,10 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); - auto weight_grad = acc.get_tensor_grad(WEIGHT); + auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD); auto weight = acc.get_tensor(WEIGHT); - auto v_tensor = acc.get_tensor(ADAM_V); - auto m_tensor = acc.get_tensor(ADAM_M); + auto v_tensor = acc.get_optimizer_tensor(ADAM_V); + auto m_tensor = acc.get_optimizer_tensor(ADAM_M); auto profiling = acc.get_argument(PROFILING); @@ -195,14 +206,18 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, tensor_guid_t const &weight, + gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors) { return attrs.visit(overload{ [&](SGDOptimizerAttrs const &s) { - return sgd_update(s, weight, grad_buffer_tensors.at(0)); + return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0)); }, [&](AdamOptimizerAttrs const &s) { - return adam_update( - s, weight, grad_buffer_tensors.at(0), grad_buffer_tensors.at(1)); + return adam_update(s, + weight, + weight_grad, + grad_buffer_tensors.at(0), + grad_buffer_tensors.at(1)); 
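The at(0)/at(1) indexing above assumes the caller sized the buffer list with get_num_optimizer_tensors (added to pcg earlier in this series). A sketch of the expected caller side; the tensor-source method name is hypothetical:

    // SGD: one auxiliary buffer (momentum); Adam: two (m and v).
    int num_bufs = get_num_optimizer_tensors(optimizer_attrs);
    std::vector<optimizer_tensor_t> bufs;
    for (int i = 0; i < num_bufs; i++) {
      bufs.push_back(optimizer_tensor_source.new_optimizer_tensor()); // hypothetical name
    }
    TaskInvocation inv =
        get_update_invocation(optimizer_attrs, weight, weight_grad, bufs);
    // caveat: the SGD branch reads grad_buffer_tensors.at(0) unconditionally,
    // so momentum-free SGD (zero buffers) would throw std::out_of_range here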
}}); } diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index f0aac85ea1..7684511488 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -15,29 +15,30 @@ void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) { TensorTypeVariant{binding}}); } -void TaskBinding::bind_grad(int name, tensor_guid_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_grad(int name, gradient_tensor_t const &binding) { + this->bind_grad(slot_id_t{name}, binding); } -void TaskBinding::bind_grad(slot_id_t name, tensor_guid_t const &binding) { +void TaskBinding::bind_grad(slot_id_t name, gradient_tensor_t const &binding) { this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT}, TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, optimizer_tensor_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_optimizer(int name, optimizer_tensor_t const &binding) { + this->bind_optimizer(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, optimizer_tensor_t const &binding) { +void TaskBinding::bind_optimizer(slot_id_t name, + optimizer_tensor_t const &binding) { this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER}, TensorTypeVariant{binding}}); } -void TaskBinding::bind(int name, loss_tensor_t const &binding) { - this->bind(slot_id_t{name}, binding); +void TaskBinding::bind_loss(int name, loss_tensor_t const &binding) { + this->bind_loss(slot_id_t{name}, binding); } -void TaskBinding::bind(slot_id_t name, loss_tensor_t const &binding) { +void TaskBinding::bind_loss(slot_id_t name, loss_tensor_t const &binding) { this->tensor_bindings.insert( {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}}); } diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 585399ea1d..41c4ff5b5c 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -257,12 +257,6 @@ struct ComputationGraphBuilder { std::vector get_outputs(LayerAttrs const &) const; tensor_guid_t get_output(LayerAttrs const &, int idx) const; - LayerAddedResult add_layer_and_get_layer_added_result( - LayerAttrs const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs); - std::vector add_layer(LayerAttrs const &layer, std::vector const &inputs, diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 35113553f2..019b120936 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -179,10 +179,6 @@ struct ParallelComputationGraphBuilder { ParallelComputationGraph pcg; }; -ParallelTensorAttrs - make_weight_attrs(ParallelTensorShape const &shape, - std::optional const &initializer_attrs); - } // namespace FlexFlow #endif diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 4c619288cb..4a565476bd 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -106,7 +106,7 @@ static void check_incoming_tensor_roles(LayerAttrs const &layer, } } -LayerAddedResult 
ComputationGraphBuilder::add_layer_and_get_layer_added_result( +std::vector ComputationGraphBuilder::add_layer( LayerAttrs const &layer, std::vector const &inputs, std::vector const &weights, @@ -115,17 +115,7 @@ LayerAddedResult ComputationGraphBuilder::add_layer_and_get_layer_added_result( LayerAddedResult added = ::FlexFlow::add_layer( this->computation_graph, layer, concat_vectors(inputs, weights), outputs); - return added; -} - -std::vector ComputationGraphBuilder::add_layer( - LayerAttrs const &layer, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - return this - ->add_layer_and_get_layer_added_result(layer, inputs, weights, outputs) - .outputs; + return added.outputs; } tensor_guid_t ComputationGraphBuilder::as_type(tensor_guid_t const &x, diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index b56156fe8a..ce00ea62f4 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -20,7 +20,7 @@ static std::string get_default_name(PCGOperatorAttrs const &attrs) { return get_default_name(get_op_type(attrs)); } -ParallelTensorAttrs make_weight_attrs( +static ParallelTensorAttrs make_weight_attrs( ParallelTensorShape const &shape, std::optional const &initializer_attrs) { return ParallelTensorAttrs{ From 6f689a472be91eed310c48217004754f315aab94 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Feb 2025 17:46:01 -0800 Subject: [PATCH 34/91] feat: add Future wrapper for func result --- .../include/realm-backend/task_result.h | 103 ++++++++++++++++++ .../include/realm-backend/task_wrapper.h | 6 +- lib/realm-backend/src/task_result.cc | 35 ++++++ lib/realm-backend/src/task_wrapper.cc | 28 ++--- 4 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/task_result.h create mode 100644 lib/realm-backend/src/task_result.cc diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h new file mode 100644 index 0000000000..5fb158496e --- /dev/null +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -0,0 +1,103 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H + +#include "realm-backend/driver.h" +#include + +namespace FlexFlow { + +/** + * @brief SharedState class template that holds the state for both the Promise + * and Future objects. It is responsible for storing the result value and + * synchronization between the producer (Promise) and consumer (Future). + */ +template struct SharedState { + // synchronization primitives + Realm::Event event = Realm::Event::NO_EVENT; + // where the result is stored + Realm::RegionInstance inst; + + SharedState() = delete; + SharedState(Realm::Memory); + void set_event(Realm::Event); + void set_value(T &&); + void wait(); + T get_value(); +}; + +// Specialization of SharedState for the `void` type, as it does not carry a +// value. +template <> struct SharedState { + // synchronization primitives + Realm::Event event = Realm::Event::NO_EVENT; + + SharedState() = default; + void set_event(Realm::Event); + void wait(); +}; + +/** + * @brief Future class template that allows retrieving the result from a + * SharedState object. 
It is used to access the value once the Promise has been
+ * fulfilled, and provides mechanisms to block the current thread until the
+ * result is available.
+ */
+template <typename T> class Future {
+public:
+  explicit Future(std::shared_ptr<SharedState<T>> state)
+      : state_(std::move(state)) {}
+  explicit Future(T value) : value_(std::move(value)) {}
+  void set_event(Realm::Event e) { state_->set_event(e); }
+  T get() {
+    value_ = state_->get_value();
+    return value_;
+  }
+  void wait() { state_->wait(); }
+
+private:
+  std::shared_ptr<SharedState<T>> state_;
+  T value_;
+};
+
+// Specialization of Future for the `void` type, as it does not carry a value.
+template <> class Future<void> {
+public:
+  explicit Future(std::shared_ptr<SharedState<void>> state)
+      : state_(std::move(state)) {}
+  explicit Future() = default;
+  void set_event(Realm::Event e) { state_->set_event(e); }
+  void wait() { state_->wait(); }
+
+private:
+  std::shared_ptr<SharedState<void>> state_;
+};
+
+/**
+ * @brief Promise class template that allows setting a result in a SharedState
+ * object. It is used to fulfill a Future with a value, and provides methods to
+ * notify the waiting Future of completion.
+ */
+template <typename T> class Promise {
+public:
+  Promise() = delete;
+  Promise(Realm::Memory mem) : state_(std::make_shared<SharedState<T>>(mem)) {}
+  Future<T> get_future() { return Future<T>(state_); }
+  void set_value(T &&value) const { state_->set_value(std::move(value)); }
+
+private:
+  std::shared_ptr<SharedState<T>> state_;
+};
+
+// Specialization of Promise for the `void` type, as it does not carry a value.
+template <> class Promise<void> {
+public:
+  Promise() : state_(std::make_shared<SharedState<void>>()) {}
+  Future<void> get_future() { return Future<void>(state_); }
+
+private:
+  std::shared_ptr<SharedState<void>> state_;
+};
+
+} // namespace FlexFlow
+
+#endif
\ No newline at end of file
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
index bf53ca7e93..89521becf4 100644
--- a/lib/realm-backend/include/realm-backend/task_wrapper.h
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -2,18 +2,18 @@
 #define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H
 #include "local-execution/task_registry.h"
-#include "realm-backend/driver.h"
 #include "realm-backend/realm_task_argument_accessor.h"
+#include "realm-backend/task_result.h"
 namespace FlexFlow {
 /* The following are general task wrappers to be invoked by the Realm runtime */
-struct RealmTaskArgs {
+template <typename T> struct RealmTaskArgs {
   task_id_t task_id;
   TaskImplFunction impl_function;
   TaskArgumentAccessor accessor;
-  void *result;
+  Promise<T> promise;
 };
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p);
diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc
new file mode 100644
index 0000000000..05aa1a8a9c
--- /dev/null
+++ b/lib/realm-backend/src/task_result.cc
@@ -0,0 +1,35 @@
+#include "realm-backend/task_result.h"
+
+namespace FlexFlow {
+
+/************ SharedState implementation ************/
+template <typename T> SharedState<T>::SharedState(Realm::Memory mem) {
+  Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
+  this->inst = Realm::RegionInstance::NO_INST;
+  Realm::RegionInstance::create_instance(
+      this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
+      Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
+      .wait();
+}
+
+template <typename T> void SharedState<T>::set_event(Realm::Event e) {
+  this->event = e;
+}
+
+template <typename T> void SharedState<T>::set_value(T &&value) {
+  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+  acc[Realm::Point<1>(0)] = std::move(value);
+}
+
+template <typename T> void SharedState<T>::wait() {
this->event.wait(); } + +template T SharedState::get_value() { + wait(); + Realm::GenericAccessor acc(this->inst, 0); + return acc[Realm::Point<1>(0)]; +} + +void SharedState::set_event(Realm::Event e) { this->event = e; } + +void SharedState::wait() { this->event.wait(); } +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 7361a24cd9..e58d2611af 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -6,31 +6,31 @@ using namespace Realm; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - *reinterpret_cast(RealmTaskArgs.result) = - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + DeviceSpecificDeviceStates result = fn(task_args.accessor); + task_args.promise.set_value(std::move(result)); } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs> const &task_args = + *reinterpret_cast> *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - *reinterpret_cast *>(RealmTaskArgs.result) = - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + std::optional result = fn(task_args.accessor); + task_args.promise.set_value(std::move(result)); } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = - RealmTaskArgs.impl_function.get().function_ptr; - fn(RealmTaskArgs.acc); + task_args.impl_function.get().function_ptr; + fn(task_args.accessor); } void register_wrapper_tasks_init(Processor p, task_id_t task_id) { From fe2bc2160e172afaef4bc8f0c4a09a77a55b9763 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Feb 2025 17:48:38 -0800 Subject: [PATCH 35/91] feat: add realm-backend draft impl --- .../realm-backend/model_training_instance.h | 37 +++ .../include/realm-backend/realm_allocator.h | 59 ++++ .../realm-backend/realm_args_backing.h | 15 +- .../realm_task_argument_accessor.h | 10 +- .../realm-backend/realm_tensor_backing.h | 22 +- .../realm-backend/realm_training_backing.h | 62 ++-- .../src/model_training_instance.cc | 87 +++++ lib/realm-backend/src/realm_allocator.cc | 54 +++ lib/realm-backend/src/realm_args_backing.cc | 65 ++++ .../src/realm_task_argument_accessor.cc | 95 ++++++ lib/realm-backend/src/realm_tensor_backing.cc | 127 ++++++++ .../src/realm_training_backing.cc | 307 +++++++++++------- 12 files changed, 778 insertions(+), 162 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/model_training_instance.h create mode 100644 lib/realm-backend/include/realm-backend/realm_allocator.h create mode 100644 lib/realm-backend/src/model_training_instance.cc create mode 100644 lib/realm-backend/src/realm_allocator.cc create mode 100644 lib/realm-backend/src/realm_args_backing.cc create mode 100644 lib/realm-backend/src/realm_task_argument_accessor.cc create mode 100644 lib/realm-backend/src/realm_tensor_backing.cc diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h 
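To make the producer/consumer split concrete, a usage sketch for the Promise/Future pair and wrapper tasks above (the task-id constant, processor, and memory choice are illustrative):

    // Caller side: allocate region-backed result storage, spawn the wrapper
    // task, then tie the future to the completion event.
    Promise<std::optional<float>> promise(mem);
    Future<std::optional<float>> future = promise.get_future();
    RealmTaskArgs<std::optional<float>> args{task_id, impl_fn, accessor, promise};
    Realm::Event done = worker_proc.spawn(FWDBWD_TASK_ID, &args, sizeof(args));
    future.set_event(done);
    // fwdbwd_wrapper_task runs fn(accessor) and fulfills the promise; get()
    // blocks on the event, then copies the value out of the RegionInstance.
    std::optional<float> elapsed = future.get();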
b/lib/realm-backend/include/realm-backend/model_training_instance.h new file mode 100644 index 0000000000..aa3876fb0d --- /dev/null +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -0,0 +1,37 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H +#define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H + +#include "realm-backend/realm_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "local-execution/loss_tensor_t.dtg.h" + +namespace FlexFlow { + +using PerLayerElapsedTime = + std::unordered_map>; + +struct ModelTrainingInstance { + ModelTrainingInstance(ComputationGraph const &, + RuntimeArgConfig const &, + LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, + OptimizerAttrs const &); + + void execute_init(); + PerLayerElapsedTime execute_forward(); + PerLayerElapsedTime execute_backward(); + void execute_update(); + + ComputationGraph computation_graph; + RealmTrainingBacking training_backing; + LossAttrs loss_attrs; + tensor_guid_t logit_tensor; + loss_tensor_t label_tensor; + OptimizerAttrs optimizer_attrs; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h new file mode 100644 index 0000000000..1e0c7b23c4 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -0,0 +1,59 @@ +#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ALLOCATOR_H +#define _FLEXFLOW_REALM_BACKEND_REALM_ALLOCATOR_H + +#include "realm-backend/driver.h" +#include "realm.h" +#include + +namespace FlexFlow { + +struct RealmAllocatorImpl; + +struct RealmRegion { + Realm::RegionInstance instance; + RealmAllocatorImpl *allocator; +}; + +struct RealmAllocatorImpl { + RealmAllocatorImpl() = delete; + RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; + RealmAllocatorImpl(RealmAllocatorImpl &&) = delete; + RealmAllocatorImpl(Realm::Processor); + ~RealmAllocatorImpl() = default; + + RealmRegion allocate(size_t); + void deallocate(RealmRegion); + +private: + std::unordered_map ptrs; + Realm::Processor proc; + Realm::Memory mem; + std::vector field_sizes = {sizeof(char)}; +}; + +struct RealmAllocator { + RealmAllocator() = delete; + + RealmRegion allocate(size_t); + void deallocate(RealmRegion); + + template + static typename std::enable_if::value, + RealmAllocator>::type + create(Args &&...args) { + return RealmAllocator(std::make_shared(std::forward(args)...)); + } + + RealmAllocator(std::shared_ptr ptr) : i_allocator(ptr) {}; + RealmAllocator(RealmAllocator const &allocator) + : i_allocator(allocator.i_allocator) {}; + +private: + std::shared_ptr i_allocator; +}; + +RealmAllocator create_realm_memory_allocator(Realm::Processor); + +} // namespace FlexFlow + +#endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h index 626698cba6..88db880fcb 100644 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -1,22 +1,23 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/computation_graph.h" -#include "local-execution/per_device_op_state.h" #include "local-execution/op_task_invocation.h" +#include "local-execution/per_device_op_state.h" 
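Downstream of these headers, the intended call pattern for ModelTrainingInstance (declared above) is a conventional training loop; a rough sketch, with construction arguments as in that header:

    ModelTrainingInstance instance(computation_graph,
                                   runtime_arg_config,
                                   loss_attrs,
                                   logit_tensor,
                                   label_tensor,
                                   optimizer_attrs);
    instance.execute_init();
    int num_iters = 100; // illustrative
    for (int iter = 0; iter < num_iters; iter++) {
      PerLayerElapsedTime fwd_times = instance.execute_forward();
      PerLayerElapsedTime bwd_times = instance.execute_backward();
      instance.execute_update(); // optimizer step; mirrors update() in the local backend
    }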
#include "local-execution/runtime_arg_config.h" #include "local-execution/task_invocation.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/layer_guid_t.dtg.h" #include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/task_result.h" namespace FlexFlow { -struct LocalArgsBacking { - LocalArgsBacking(RuntimeArgConfig const &); +struct RealmArgsBacking { + RealmArgsBacking(RuntimeArgConfig const &); public: void add_per_device_op_state(layer_guid_t const &, - DeviceSpecificDeviceStates const &); + Future &&); ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; @@ -32,6 +33,6 @@ struct LocalArgsBacking { RuntimeArgConfig runtime_arg_config; }; -} +} // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index ca4bc9db02..5c7ecafd0f 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -13,13 +13,13 @@ using TensorSlotsBacking = std::unordered_map< std::variant>>; using ArgSlotsBacking = std::unordered_map; -struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { - LocalTaskArgumentAccessor(Allocator const &allocator, +struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { + RealmTaskArgumentAccessor(Allocator const &allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); - LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; - LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; + RealmTaskArgumentAccessor(RealmTaskArgumentAccessor const &) = delete; + RealmTaskArgumentAccessor(RealmTaskArgumentAccessor &&) = delete; ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; @@ -48,7 +48,7 @@ TensorSlotsBackingWithoutAddresses get_slots_backing_without_tensor_allocation_addresses( TensorSlotsBacking const &); -CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); +CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index 2d9fa0bbdf..d9df0dfcb1 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -4,6 +4,7 @@ #include "kernels/accessor.h" #include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/realm_allocator.h" #include "local-execution/task_invocation.dtg.h" #include "local-execution/tensor_role.dtg.h" #include "local-execution/lowered_tensor_t.dtg.h" @@ -16,23 +17,25 @@ namespace FlexFlow { -using TensorBackingMap = - std::unordered_map; +using TensorRegionMap = + std::unordered_map; +using TensorShapeMap = + std::unordered_map; -struct LocalTensorBacking { - LocalTensorBacking(); +struct RealmTensorBacking { + RealmTensorBacking(); public: void allocate_layer_tensors(layer_guid_t const &, ComputationGraph const &, - Allocator &); + RealmAllocator &); void allocate_tensors_by_role(TensorRole const &, layer_guid_t const &, ComputationGraph const &, - Allocator &); + RealmAllocator &); void allocate_optimizer_tensors(tensor_guid_t const &, std::vector const &, - Allocator &); + RealmAllocator &); TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const; @@ -43,13 +46,12 @@ struct 
diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
index 2d9fa0bbdf..d9df0dfcb1 100644
--- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
+++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h
@@ -4,6 +4,7 @@

 #include "kernels/accessor.h"
 #include "realm-backend/realm_task_argument_accessor.h"
+#include "realm-backend/realm_allocator.h"
 #include "local-execution/task_invocation.dtg.h"
 #include "local-execution/tensor_role.dtg.h"
 #include "local-execution/lowered_tensor_t.dtg.h"
@@ -16,23 +17,25 @@ namespace FlexFlow {

-using TensorBackingMap =
-    std::unordered_map<lowered_tensor_t, GenericTensorAccessorW>;
+using TensorRegionMap =
+    std::unordered_map<lowered_tensor_t, RealmRegion>;
+using TensorShapeMap =
+    std::unordered_map<lowered_tensor_t, TensorShape>;

-struct LocalTensorBacking {
-  LocalTensorBacking();
+struct RealmTensorBacking {
+  RealmTensorBacking();

 public:
   void allocate_layer_tensors(layer_guid_t const &,
                               ComputationGraph const &,
-                              Allocator &);
+                              RealmAllocator &);
   void allocate_tensors_by_role(TensorRole const &,
                                 layer_guid_t const &,
                                 ComputationGraph const &,
-                                Allocator &);
+                                RealmAllocator &);
   void allocate_optimizer_tensors(tensor_guid_t const &,
                                   std::vector<optimizer_tensor_t> const &,
-                                  Allocator &);
+                                  RealmAllocator &);

   TensorSlotsBacking construct_tensor_slots_backing(TaskBinding const &) const;

@@ -43,13 +46,12 @@ struct LocalTensorBacking {

 public:
   // tensors
-  TensorBackingMap tensor_backings;
-
+  TensorRegionMap tensor_regions;
+  TensorShapeMap tensor_shapes;
   std::unordered_map<tensor_guid_t, lowered_tensor_t> tensor_lowering_mapping;
   std::unordered_map<tensor_guid_t, lowered_tensor_t>
       gradient_tensor_lowering_mapping;
   std::unordered_map<optimizer_tensor_t, lowered_tensor_t>
       optimizer_tensor_lowering_mapping;
   std::unordered_map<loss_tensor_t, lowered_tensor_t>
       loss_tensor_lowering_mapping;
-
   LoweredTensorSource lowered_tensor_source;
 };
diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h
index e5385a93c3..ddd3bb7ed1 100644
--- a/lib/realm-backend/include/realm-backend/realm_training_backing.h
+++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h
@@ -1,56 +1,62 @@
 #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H
 #define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H

-#include "realm-backend/realm_tensor_backing.h"
-#include "realm-backend/realm_args_backing.h"
+#include "local-execution/optimizer_tensor_source.h"
 #include "local-execution/task_registry.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.dtg.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "local-execution/optimizer_tensor_source.h"
+#include "realm-backend/driver.h"
+#include "realm-backend/realm_allocator.h"
+#include "realm-backend/realm_args_backing.h"
+#include "realm-backend/realm_tensor_backing.h"
+#include "realm-backend/task_wrapper.h"

 namespace FlexFlow {

 using PerLayerElapsedTime =
     std::unordered_map<layer_guid_t, std::optional<float>>;

-struct LocalTrainingBacking {
-  LocalTrainingBacking(Allocator const &,
-                       ComputationGraph const &,
-                       RuntimeArgConfig const &);
+struct RealmTrainingBacking {
+  RealmTrainingBacking(ComputationGraph const &, RuntimeArgConfig const &,
+                       Realm::Processor);

   void register_and_allocate_layer(layer_guid_t const &);
   void allocate_layer_optimizer_tensors(layer_guid_t const &,
                                         OptimizerAttrs const &);

   void execute_init(layer_guid_t const &);
-  std::optional<float> execute_forward(layer_guid_t const &);
-  void compute_loss(LossAttrs const &loss_attrs,
-                    tensor_guid_t const &logit_tensor,
-                    loss_tensor_t const &label_tensor);
-  std::optional<float> execute_backward(layer_guid_t const &);
-  void execute_update(layer_guid_t const &, OptimizerAttrs const &);
-
-  TaskArgumentAccessor
-      get_task_arg_accessor(TaskInvocation const &) const;
+  Future<std::optional<float>> execute_forward(layer_guid_t const &);
+  Future<std::optional<float>> execute_backward(layer_guid_t const &);
+  Future<void> execute_update(layer_guid_t const &, OptimizerAttrs const &);
+  Future<void> compute_loss(LossAttrs const &loss_attrs,
+                            tensor_guid_t const &logit_tensor,
+                            loss_tensor_t const &label_tensor);

-  TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, layer_guid_t const &) const;
+  TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const;

-  LocalTensorBacking local_tensor_backing;
-  LocalArgsBacking local_args_backing;
+  TaskInvocation lower_to_task_invocation(OpTaskInvocation const &,
+                                          layer_guid_t const &) const;

-private:
-  DeviceSpecificDeviceStates call_init_task_impl(task_id_t,
-                                                 TaskArgumentAccessor const &);
-  std::optional<float> call_task_impl(task_id_t, TaskArgumentAccessor);
-
-private:
-  Allocator allocator;
   ComputationGraph computation_graph;
   TaskRegistry task_registry;

-  // optimizer
+  // runtime
+  Realm::Processor master_proc;
+  Realm::Memory master_mem;
+  std::vector<Realm::Processor> worker_procs;
+  std::unordered_map<Realm::Processor, Realm::Event> proc_events;
+  std::vector<RealmAllocator> allocators;
+
+  // storage
+  RealmTensorBacking realm_tensor_backing;
+  RealmArgsBacking realm_args_backing;
   OptimizerTensorSource optimizer_tensor_source;
-  std::unordered_map<layer_guid_t, std::vector<optimizer_tensor_t>> layer_optimizer_tensor_ids;
+  std::unordered_map<layer_guid_t, std::vector<optimizer_tensor_t>>
+      layer_optimizer_tensor_ids;
+
+private:
+  std::optional<float> call_task_impl(task_id_t, TaskSignatureAndImpl,
+                                      TaskArgumentAccessor);
 };

 } // namespace FlexFlow
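NOTE (sketch): the Future-returning API above is meant to be consumed in two phases:
spawn every layer's task without blocking, then drain the futures. This mirrors the
ModelTrainingInstance driver in the next file; `cg`, `config`, and `proc` are
illustrative:

    RealmTrainingBacking backing{cg, config, proc};
    for (layer_guid_t const &l : topological_ordering(cg)) {
      backing.register_and_allocate_layer(l);
    }
    std::vector<Future<std::optional<float>>> pending;
    for (layer_guid_t const &l : topological_ordering(cg)) {
      pending.push_back(backing.execute_forward(l)); // spawn, don't block
    }
    for (Future<std::optional<float>> &f : pending) {
      f.get();                                       // then drain
    }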
diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc
new file mode 100644
index 0000000000..f9c959c389
--- /dev/null
+++ b/lib/realm-backend/src/model_training_instance.cc
@@ -0,0 +1,87 @@
+#include "pcg/computation_graph.h"
+#include "pcg/optimizer_attrs.h"
+#include "realm-backend/model_training_instance.h"
+#include "utils/containers/reversed.h"
+
+namespace FlexFlow {
+
+ModelTrainingInstance::ModelTrainingInstance(
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config, LossAttrs const &loss_attrs,
+    tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor,
+    OptimizerAttrs const &optimizer_attrs)
+    : computation_graph(computation_graph),
+      training_backing(computation_graph, runtime_arg_config),
+      loss_attrs(loss_attrs), logit_tensor(logit_tensor),
+      label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) {
+
+  // allocate each layer's tensors
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    this->training_backing.register_and_allocate_layer(node);
+    this->training_backing.allocate_layer_optimizer_tensors(
+        node, this->optimizer_attrs);
+  }
+}
+
+void ModelTrainingInstance::execute_init() {
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    this->training_backing.execute_init(node);
+  }
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_forward() {
+  PerLayerElapsedTime per_layer_elapsed_time;
+  std::unordered_map<layer_guid_t, Future<std::optional<float>>>
+      per_layer_elapsed_time_future;
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_elapsed_time_future.insert(
+        {node, this->training_backing.execute_forward(node)});
+  }
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    std::optional<float> elapsed_time =
+        per_layer_elapsed_time_future[node].get();
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+PerLayerElapsedTime ModelTrainingInstance::execute_backward() {
+  this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor,
+                                      this->label_tensor);
+  PerLayerElapsedTime per_layer_elapsed_time;
+  std::unordered_map<layer_guid_t, Future<std::optional<float>>>
+      per_layer_elapsed_time_future;
+  for (layer_guid_t const &node :
+       reversed(topological_ordering(this->computation_graph))) {
+    per_layer_elapsed_time_future.insert(
+        {node, this->training_backing.execute_backward(node)});
+  }
+  for (layer_guid_t const &node :
+       reversed(topological_ordering(this->computation_graph))) {
+    std::optional<float> elapsed_time =
+        per_layer_elapsed_time_future[node].get();
+    per_layer_elapsed_time.insert({node, elapsed_time});
+  }
+  return per_layer_elapsed_time;
+}
+
+void ModelTrainingInstance::execute_update() {
+  std::unordered_map<layer_guid_t, Future<void>> per_layer_update_future;
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_update_future.insert(
+        {node,
+         this->training_backing.execute_update(node, this->optimizer_attrs)});
+  }
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    per_layer_update_future[node].wait();
+  }
+  this->optimizer_attrs =
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
+}
+
+} // namespace FlexFlow
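NOTE (sketch): the "spawn everything, then drain the futures" style above stays correct
on a single worker because realm_training_backing.cc (further below) chains every spawn
on the previous task's completion event for that processor:

    // pattern used at every spawn site; proc_events remembers the chain tail
    Realm::Event prev = proc_events[worker];
    Realm::Event done = worker.spawn(func_id, &args, sizeof(args), prev);
    proc_events[worker] = done; // the next task on `worker` waits on `done`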
diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc
new file mode 100644
index 0000000000..fadc7f5719
--- /dev/null
+++ b/lib/realm-backend/src/realm_allocator.cc
@@ -0,0 +1,54 @@
+#include "realm-backend/realm_allocator.h"
+#include "utils/containers/contains_key.h"
+
+namespace FlexFlow {
+
+using namespace Realm;
+
+/*********** RealmAllocatorImpl ***********/
+
+RealmAllocatorImpl::RealmAllocatorImpl(Processor proc) : proc(proc) {
+  mem = Machine::MemoryQuery(Machine::get_machine())
+            .only_kind(Memory::GPU_FB_MEM)
+            .best_affinity_to(proc)
+            .first();
+}
+
+// TODO: now the region instance only corresponds to one tensor
+RealmRegion RealmAllocatorImpl::allocate(size_t requested_memory_size) {
+  Rect<1> bounds(Point<1>(0), Point<1>(requested_memory_size - 1));
+  RegionInstance requested_instance = RegionInstance::NO_INST;
+  RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes,
+                                  /*SOA*/ 1, ProfilingRequestSet())
+      .wait();
+  void *ptr = requested_instance.pointer_untyped(0, 0);
+  this->ptrs.insert({requested_instance, ptr});
+  return {requested_instance, this};
+}
+
+void RealmAllocatorImpl::deallocate(RealmRegion region) {
+  if (region.allocator == this and contains_key(this->ptrs, region.instance)) {
+    // destroy the instance and forget its cached base pointer
+    region.instance.destroy();
+    this->ptrs.erase(region.instance);
+  } else {
+    throw std::runtime_error(
+        "Deallocating a pointer that was not allocated by this Allocator");
+  }
+}
+
+/*********** RealmAllocator ***********/
+
+RealmRegion RealmAllocator::allocate(size_t mem_size) {
+  return this->i_allocator->allocate(mem_size);
+}
+
+void RealmAllocator::deallocate(RealmRegion region) {
+  this->i_allocator->deallocate(region);
+}
+
+RealmAllocator create_realm_memory_allocator(Processor proc) {
+  return RealmAllocator::create(proc);
+}
+
+} // namespace FlexFlow
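NOTE (sketch): allocate() sizes the instance as `requested_memory_size` points of a
single 1-byte field, so the untyped base pointer addresses exactly that many bytes.
Spelled out (the Realm calls are real; `nbytes` is illustrative):

    size_t nbytes = /* tensor size in bytes */;
    Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(nbytes - 1));
    // field_sizes == {sizeof(char)}  =>  one byte per point, nbytes total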
diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc
new file mode 100644
index 0000000000..ae7022f4b0
--- /dev/null
+++ b/lib/realm-backend/src/realm_args_backing.cc
@@ -0,0 +1,65 @@
+#include "op-attrs/parallel_tensor_shape.h"
+#include "realm-backend/realm_args_backing.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/map_values.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+void RealmArgsBacking::add_per_device_op_state(
+    layer_guid_t const &op_guid, Future<DeviceSpecificDeviceStates> &&future) {
+  if (per_device_op_states.find(op_guid) != per_device_op_states.end()) {
+    throw mk_runtime_error("Op state already exists");
+  }
+  per_device_op_states.insert({op_guid, std::move(future)});
+}
+
+ArgSlotsBacking RealmArgsBacking::construct_arg_slots_backing(
+    TaskBinding const &binding) const {
+  return map_values(binding.get_arg_bindings(),
+                    [&](TaskArgSpec const &arg_binding) {
+                      return arg_binding.template visit<ConcreteArgSpec>(
+                          overload{[&](RuntimeArgRefSpec const &s) {
+                                     return this->lower_to_concrete_arg_spec(s);
+                                   },
+                                   [](ConcreteArgSpec const &s) { return s; }});
+                    });
+}
+
+ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec(
+    OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const &cg,
+    layer_guid_t const &op_guid) const {
+  if (op_arg_ref_spec.holds<DeviceSpecificDeviceStates>()) {
+    assert(contains_key(this->per_device_op_states, op_guid));
+    // blocks until the corresponding init task has produced the state
+    DeviceSpecificDeviceStates device_specific =
+        per_device_op_states.at(op_guid).get();
+    PerDeviceOpState device_state =
+        get_device_state_from_device_specific(device_specific, 0);
+    return ConcreteArgSpec::create(device_state);
+  } else if (op_arg_ref_spec.holds<ParallelTensorShape>()) {
+    ParallelTensorShapeRefType index_op_arg_ref =
+        op_arg_ref_spec.get_ref_type().get<ParallelTensorShapeRefType>();
+    tensor_guid_t input_tensor =
+        get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx);
+    TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor);
+    ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape);
+    return ConcreteArgSpec::create(shape);
+  } else {
+    throw mk_runtime_error("Unhandled op arg ref type");
+  }
+}
+
+ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec(
+    RuntimeArgRefSpec const &runtime_arg_ref_spec) const {
+  if (runtime_arg_ref_spec.holds<DeviceSpecific<PerDeviceFFHandle>>()) {
+    return ConcreteArgSpec::create(
+        *(this->runtime_arg_config.ff_handle.get(0)));
+  } else if (runtime_arg_ref_spec.holds<ProfilingSettings>()) {
+    return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings);
+  } else {
+    throw mk_runtime_error("Unhandled runtime arg ref type");
+  }
+}
+
+} // namespace FlexFlow
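NOTE (sketch): this lowering is what keeps op task code backend-agnostic: bindings
carry refs, and the backend resolves them to shippable values right before spawn.
Hypothetical flow for one profiling argument (PROFILING is an illustrative slot name):

    // op code binds a runtime ref, not a value
    binding.bind_arg(PROFILING, profiling_settings());   // a RuntimeArgRefSpec
    // at launch time, RealmArgsBacking turns the ref into a concrete value
    ConcreteArgSpec spec = args_backing.lower_to_concrete_arg_spec(ref_spec);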
diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc
new file mode 100644
index 0000000000..7b27bad6c2
--- /dev/null
+++ b/lib/realm-backend/src/realm_task_argument_accessor.cc
@@ -0,0 +1,95 @@
+#include "realm-backend/realm_task_argument_accessor.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/transform.h"
+#include "utils/hash/pair.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+RealmTaskArgumentAccessor::RealmTaskArgumentAccessor(
+    Allocator const &allocator,
+    TensorSlotsBacking const &tensor_slots_backing,
+    ArgSlotsBacking const &arg_slots_backing)
+    : allocator(allocator), tensor_slots_backing(tensor_slots_backing),
+      arg_slots_backing(arg_slots_backing){};
+
+ConcreteArgSpec const &
+    RealmTaskArgumentAccessor::get_concrete_arg(slot_id_t name) const {
+  return this->arg_slots_backing.at(name);
+}
+
+GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor(
+    slot_id_t slot, Permissions priv, TensorType tensor_type) const {
+  SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type};
+  auto tensor_backing = std::get<GenericTensorAccessorW>(
+      this->tensor_slots_backing.at(slot_tensor_type));
+  if (priv == Permissions::RO) {
+    GenericTensorAccessorR readonly_tensor_backing = {
+        tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr};
+    return readonly_tensor_backing;
+  } else if (priv == Permissions::RW || priv == Permissions::WO) {
+    return tensor_backing;
+  } else {
+    throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv));
+  }
+}
+
+VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor(
+    slot_id_t slot, Permissions priv, TensorType tensor_type) const {
+  SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type};
+  auto variadic_tensor_backing = std::get<std::vector<GenericTensorAccessorW>>(
+      this->tensor_slots_backing.at(slot_tensor_type));
+  if (priv == Permissions::RO) {
+    std::vector<GenericTensorAccessorR> readonly_variadic_tensor_backing = {};
+    for (GenericTensorAccessorW const &tensor_backing :
+         variadic_tensor_backing) {
+      readonly_variadic_tensor_backing.push_back(
+          {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr});
+    }
+    return readonly_variadic_tensor_backing;
+  } else if (priv == Permissions::RW || priv == Permissions::WO) {
+    return variadic_tensor_backing;
+  } else {
+    throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv));
+  }
+}
+
+Allocator RealmTaskArgumentAccessor::get_allocator() const {
+  return this->allocator;
+}
+
+TensorSlotsBackingWithoutAddresses
+    get_slots_backing_without_tensor_allocation_addresses(
+        TensorSlotsBacking const &slots_backing) {
+
+  TensorSlotsBackingWithoutAddresses addressless_slots_backing;
+
+  using TensorAccessorVariant =
+      std::variant<GenericTensorAccessorW,
+                   std::vector<GenericTensorAccessorW>>;
+  for (auto const &slot_tensor : slots_backing) {
+    TensorAccessorVariant accessor_variant = slot_tensor.second;
+    std::visit(
+        overload{
+            [&](GenericTensorAccessorW const &accessor) {
+              addressless_slots_backing.insert(
+                  {slot_tensor.first, get_shape_and_datatype(accessor)});
+            },
+            [&](std::vector<GenericTensorAccessorW> const &variadic_accessor) {
+              std::vector<std::pair<ArrayShape, DataType>>
+                  variadic_addressless_accessor =
+                      transform(variadic_accessor,
+                                [](GenericTensorAccessorW const &accessor) {
+                                  return get_shape_and_datatype(accessor);
+                                });
+              addressless_slots_backing.insert(
+                  {slot_tensor.first, variadic_addressless_accessor});
+            }},
+        accessor_variant);
+  }
+  return addressless_slots_backing;
+}
+
+size_t RealmTaskArgumentAccessor::get_device_idx() const {
+  return 0;
+}
+
+} // namespace FlexFlow
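NOTE (sketch): get_slots_backing_without_tensor_allocation_addresses exists so tests
can compare two slot backings structurally; device pointers differ run to run, so
equality is checked on (shape, dtype) alone. Assumed doctest-style usage:

    CHECK(get_slots_backing_without_tensor_allocation_addresses(actual) ==
          get_slots_backing_without_tensor_allocation_addresses(expected));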
diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc
new file mode 100644
index 0000000000..6edf6cf064
--- /dev/null
+++ b/lib/realm-backend/src/realm_tensor_backing.cc
@@ -0,0 +1,127 @@
+#include "realm-backend/realm_tensor_backing.h"
+#include "local-execution/tensor_lowering.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_shape.dtg.h"
+#include "pcg/computation_graph.h"
+#include "realm-backend/realm_allocator.h"
+#include "utils/containers/contains_key.h"
+#include "utils/overload.h"
+#include "local-execution/slot_grad_id.dtg.h"
+
+namespace FlexFlow {
+
+RealmTensorBacking::RealmTensorBacking() {};
+
+void RealmTensorBacking::allocate_layer_tensors(
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    RealmAllocator &allocator) {
+  this->allocate_tensors_by_role(
+      TensorRole::INPUT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::WEIGHT, layer_guid, computation_graph, allocator);
+  this->allocate_tensors_by_role(
+      TensorRole::OUTPUT, layer_guid, computation_graph, allocator);
+}
+
+void RealmTensorBacking::allocate_tensors_by_role(
+    TensorRole const &role,
+    layer_guid_t const &layer_guid,
+    ComputationGraph const &computation_graph,
+    RealmAllocator &allocator) {
+  std::vector<tensor_guid_t> tensors;
+  switch (role) {
+    case TensorRole::INPUT:
+      tensors = get_incoming_inputs(computation_graph, layer_guid);
+      break;
+    case TensorRole::WEIGHT:
+      tensors = get_incoming_weights(computation_graph, layer_guid);
+      break;
+    case TensorRole::OUTPUT:
+      tensors = get_outgoing_tensors(computation_graph, layer_guid);
+      break;
+    default:
+      throw mk_runtime_error(fmt::format("Invalid tensor role, got {}", role));
+  }
+
+  for (tensor_guid_t const &tensor : tensors) {
+    TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor);
+    // tensor allocation
+    if (!contains_key(this->tensor_lowering_mapping, tensor)) {
+      lowered_tensor_t reduced_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->tensor_lowering_mapping.insert({tensor, reduced_tensor});
+      RealmRegion region =
+          allocator.allocate(get_size_in_bytes(tensor_attrs.shape));
+      this->tensor_regions.insert({reduced_tensor, region});
+      this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape});
+    }
+
+    // gradient tensor allocation
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !contains_key(this->gradient_tensor_lowering_mapping, tensor)) {
+      lowered_tensor_t reduced_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor});
+      RealmRegion region =
+          allocator.allocate(get_size_in_bytes(tensor_attrs.shape));
+      this->tensor_regions.insert({reduced_tensor, region});
+      this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape});
+    }
+  }
+}
+
+void RealmTensorBacking::allocate_optimizer_tensors(
+    tensor_guid_t const &weight,
+    std::vector<optimizer_tensor_t> const &optimizer_tensors,
+    RealmAllocator &allocator) {
+  GenericTensorAccessorW weight_backing =
+      this->get_tensor_backing(this->tensor_lowering_mapping.at(weight));
+  for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) {
+    // optimizer tensor allocation
+    if (!contains_key(this->optimizer_tensor_lowering_mapping,
+                      optimizer_tensor)) {
+      lowered_tensor_t buffer_tensor =
+          this->lowered_tensor_source.new_lowered_tensor();
+      this->optimizer_tensor_lowering_mapping.insert(
+          {optimizer_tensor, buffer_tensor});
+      TensorShape tensor_shape =
+          get_tensor_shape(weight_backing.shape, weight_backing.data_type);
+      RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape));
+      this->tensor_regions.insert({buffer_tensor, region});
+      this->tensor_shapes.insert({buffer_tensor, tensor_shape});
+    }
+  }
+}
+
+bool RealmTensorBacking::is_tensor_allocated(
+    lowered_tensor_t const &tensor_id) const {
+  return contains_key(tensor_regions, tensor_id);
+}
+
+GenericTensorAccessorW RealmTensorBacking::get_tensor_backing(
+    lowered_tensor_t const &tensor_id) const {
+  // returned by value: the accessor is assembled on the fly from the region's
+  // base pointer and the recorded shape, so returning a reference would dangle
+  void *ptr =
+      this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0);
+  TensorShape shape = this->tensor_shapes.at(tensor_id);
+  return {shape.data_type, ArrayShape{shape}, ptr};
+}
+
+TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing(
+    TaskBinding const &binding) const {
+  TensorSlotsBacking mapping;
+
+  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
+    SlotTensorTypeId slot_tensor_type_id = tensor_binding.first;
+
+    lowered_tensor_t tensor_id = [&] {
+      TensorTypeVariant tensor_type = tensor_binding.second;
+      if (tensor_type.has<tensor_guid_t>() and
+          slot_tensor_type_id.tensor_type == TensorType::FORWARD) {
+        return this->tensor_lowering_mapping.at(
+            tensor_type.get<tensor_guid_t>());
+      } else if (tensor_type.has<tensor_guid_t>() and
+                 slot_tensor_type_id.tensor_type == TensorType::GRADIENT) {
+        return this->gradient_tensor_lowering_mapping.at(
+            tensor_type.get<tensor_guid_t>());
+      } else if (tensor_type.has<optimizer_tensor_t>()) {
+        return this->optimizer_tensor_lowering_mapping.at(
+            tensor_type.get<optimizer_tensor_t>());
+      } else if (tensor_type.has<loss_tensor_t>()) {
+        return this->loss_tensor_lowering_mapping.at(
+            tensor_type.get<loss_tensor_t>());
+      } else {
+        throw mk_runtime_error(fmt::format("Tensor binding has invalid type"));
+      }
+    }();
+
+    GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id);
+    mapping.insert({slot_tensor_type_id, accessor});
+  }
+
+  return mapping;
+}
+
+} // namespace FlexFlow
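NOTE (sketch): tensor lookup above is a two-hop indirection, and all four tensor kinds
(forward, gradient, optimizer, loss) converge on the same region/shape tables:

    // hop 1: framework-level id -> lowered id (one mapping per tensor kind)
    lowered_tensor_t low = backing.tensor_lowering_mapping.at(some_tensor_guid);
    // hop 2: lowered id -> device buffer + shape
    GenericTensorAccessorW acc = backing.get_tensor_backing(low);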
"realm-backend/task_wrapper.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -14,24 +17,47 @@ namespace FlexFlow { -LocalTrainingBacking::LocalTrainingBacking( - Allocator const &allocator, +using namespace Realm; + +RealmTrainingBacking::RealmTrainingBacking( ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) - : allocator(allocator), computation_graph(computation_graph), - local_args_backing(runtime_arg_config), - task_registry(empty_task_registry()) {}; + RuntimeArgConfig const &runtime_arg_config, Realm::Processor master_proc) + : computation_graph(computation_graph), + realm_args_backing(runtime_arg_config), + task_registry(empty_task_registry()) { + master_proc = master_proc; + proc_events.insert({master_proc, Realm::Event::NO_EVENT}); + master_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first(); + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + for (Processor p : pq) { + worker_procs.push_back(p); + proc_events.insert({p, Realm::Event::NO_EVENT}); + allocators.push_back(RealmAllocator(p)); + } + assert(worker_procs.size() > 0); +} -void LocalTrainingBacking::register_and_allocate_layer( +void RealmTrainingBacking::register_and_allocate_layer( layer_guid_t const &node) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; - this->local_tensor_backing.allocate_layer_tensors( - node, this->computation_graph, this->allocator); + this->realm_tensor_backing.allocate_layer_tensors( + node, this->computation_graph, this->allocators[0]); register_tasks_for_layer(this->task_registry, node, attrs); + // TODO: multi gpu launching + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = + this->task_registry.task_mapping.at(task_id); + register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + } } -void LocalTrainingBacking::allocate_layer_optimizer_tensors( +void RealmTrainingBacking::allocate_layer_optimizer_tensors( layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { ComputationGraphOpAttrs attrs = get_layer_attrs(this->computation_graph, node).attrs; @@ -41,165 +67,222 @@ void LocalTrainingBacking::allocate_layer_optimizer_tensors( get_only(get_outgoing_tensors(this->computation_graph, node)); std::vector optimizer_tensors; - for (TensorTypeSlotSpec const & tensor_type_slot_spec: values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back(this->optimizer_tensor_source.new_optimizer_tensor()); + for (TensorTypeSlotSpec const &tensor_type_slot_spec : + values(sig.tensor_guid_slots)) { + optimizer_tensors.push_back( + this->optimizer_tensor_source.new_optimizer_tensor()); } this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->local_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocator); + this->realm_tensor_backing.allocate_optimizer_tensors( + weight_tensor, optimizer_tensors, this->allocators[0]); } } -DeviceSpecificDeviceStates - LocalTrainingBacking::call_init_task_impl(task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = - this->task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); -} - -std::optional - 
-DeviceSpecificDeviceStates
-    LocalTrainingBacking::call_init_task_impl(task_id_t task_id,
-                                              TaskArgumentAccessor const &acc) {
-  TaskSignatureAndImpl task_sig_impl =
-      this->task_registry.task_mapping.at(task_id);
-  auto fn =
-      task_sig_impl.impl_function.get<InitOpTaskImplFunction>().function_ptr;
-  return fn(acc);
-}
-
-std::optional<float>
-    LocalTrainingBacking::call_task_impl(task_id_t task_id,
-                                         TaskArgumentAccessor acc) {
-  TaskSignatureAndImpl task_sig_impl =
-      this->task_registry.task_mapping.at(task_id);
-  auto fn =
-      task_sig_impl.impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
-  return fn(acc);
-}
-
-void LocalTrainingBacking::execute_init(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::INIT)) {
+void RealmTrainingBacking::execute_init(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::INIT)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(init(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    DeviceSpecificDeviceStates device_state =
-        this->call_init_task_impl(invocation.task_id, accessor);
-    this->local_args_backing.add_per_device_op_state(operator_node,
-                                                     device_state);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(init(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<DeviceSpecificDeviceStates> promise(master_mem);
+    Future<DeviceSpecificDeviceStates> future = promise.get_future();
+    RealmTaskArgs<DeviceSpecificDeviceStates> args{
+        task_id, impl_function, accessor, std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    this->realm_args_backing.add_per_device_op_state(operator_node,
+                                                     std::move(future));
   }
 }

-std::optional<float>
-    LocalTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::FWD)) {
+Future<std::optional<float>>
+    RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::FWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(forward(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    return this->call_task_impl(invocation.task_id, accessor);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(forward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<std::optional<float>> promise(master_mem);
+    Future<std::optional<float>> future = promise.get_future();
+    RealmTaskArgs<std::optional<float>> args{task_id, impl_function, accessor,
+                                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
   } else {
-    return std::nullopt;
+    return Future<std::optional<float>>(std::nullopt);
   }
 }
-void LocalTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
-                                        tensor_guid_t const &logit_tensor,
-                                        loss_tensor_t const &label_tensor) {
-  TaskInvocation loss_invocation =
-      backward(loss_attrs, logit_tensor, label_tensor);
-  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
-  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
-  TaskArgumentAccessor loss_accessor =
-      this->get_task_arg_accessor(loss_invocation);
-  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
-  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
-}
-
-std::optional<float>
-    LocalTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(
-          this->task_registry, operator_node, OpTaskType::BWD)) {
+Future<std::optional<float>>
+    RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(this->task_registry, operator_node,
+                                       OpTaskType::BWD)) {
     ComputationGraphOpAttrs attrs =
         get_layer_attrs(this->computation_graph, operator_node).attrs;
-
-    TaskInvocation invocation = this->lower_to_task_invocation(backward(attrs));
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
-    return this->call_task_impl(invocation.task_id, accessor);
+    TaskInvocation invocation =
+        this->lower_to_task_invocation(backward(attrs), operator_node);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    TaskImplFunction impl_function =
+        this->task_registry.task_mapping.at(task_id).impl_function;
+    // TODO: multi gpu launching
+    Promise<std::optional<float>> promise(master_mem);
+    Future<std::optional<float>> future = promise.get_future();
+    RealmTaskArgs<std::optional<float>> args{task_id, impl_function, accessor,
+                                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
   } else {
-    return std::nullopt;
+    return Future<std::optional<float>>(std::nullopt);
  }
 }

-void LocalTrainingBacking::execute_update(
-    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
+Future<void>
+    RealmTrainingBacking::execute_update(layer_guid_t const &node,
+                                         OptimizerAttrs const &optimizer_attrs) {
   LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node);
   if (layer_attrs.attrs.has<WeightAttrs>()) {
     // get tensors
-    tensor_guid_t weight_tensor = get_only(get_outgoing_tensors(this->computation_graph, node));
-    std::vector<optimizer_tensor_t> optimizer_buffer_tensors = this->layer_optimizer_tensor_ids.at(node);
-
+    tensor_guid_t weight_tensor =
+        get_only(get_outgoing_tensors(this->computation_graph, node));
+    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
+        this->layer_optimizer_tensor_ids.at(node);
     // get invocation
     TaskInvocation invocation = get_update_invocation(
         optimizer_attrs, weight_tensor, optimizer_buffer_tensors);
-
     // TODO: https://github.com/flexflow/flexflow-train/issues/1442
     // assert(is_invocation_valid(get_update_signature(attrs), invocation));
-
-    // execute update
-    TaskArgumentAccessor accessor =
-        this->get_task_arg_accessor(invocation);
+    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+    task_id_t task_id = invocation.task_id;
+    register_wrapper_tasks_generic(worker_procs[0], task_id);
     TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
-    update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
+    // TODO: multi gpu launching
+    Promise<void> promise;
+    Future<void> future = promise.get_future();
+    RealmTaskArgs<void> args{task_id, update_impl_fn, accessor,
+                             std::move(promise)};
+    Event e = worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
+                                    &args, sizeof(args),
+                                    proc_events[worker_procs[0]]);
+    proc_events[worker_procs[0]] = e;
+    future.set_event(e);
+    return future;
+  } else {
+    return Future<void>();
   }
 }
+Future<void>
+    RealmTrainingBacking::compute_loss(LossAttrs const &loss_attrs,
+                                       tensor_guid_t const &logit_tensor,
+                                       loss_tensor_t const &label_tensor) {
+  TaskInvocation loss_invocation =
+      backward(loss_attrs, logit_tensor, label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor =
+      this->get_task_arg_accessor(loss_invocation);
+  task_id_t task_id = loss_invocation.task_id;
+  register_wrapper_tasks_generic(worker_procs[0], task_id);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  // TODO: multi gpu launching
+  Promise<void> promise;
+  Future<void> future = promise.get_future();
+  RealmTaskArgs<void> args{task_id, loss_impl_fn, loss_accessor,
+                           std::move(promise)};
+  Event e =
+      worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id), &args,
+                            sizeof(args), proc_events[worker_procs[0]]);
+  proc_events[worker_procs[0]] = e;
+  future.set_event(e);
+  return future;
+}
+
-TaskArgumentAccessor LocalTrainingBacking::get_task_arg_accessor(
+TaskArgumentAccessor RealmTrainingBacking::get_task_arg_accessor(
     TaskInvocation const &invocation) const {
   TensorSlotsBacking tensor_slots_backing =
-      this->local_tensor_backing.construct_tensor_slots_backing(
+      this->realm_tensor_backing.construct_tensor_slots_backing(
           invocation.binding);
   ArgSlotsBacking arg_slots_backing =
-      this->local_args_backing.construct_arg_slots_backing(invocation.binding);
-  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
-      this->allocator, tensor_slots_backing, arg_slots_backing);
+      this->realm_args_backing.construct_arg_slots_backing(invocation.binding);
+  return TaskArgumentAccessor::create<RealmTaskArgumentAccessor>(
+      this->allocators[0], tensor_slots_backing, arg_slots_backing);
 }

-TaskInvocation LocalTrainingBacking::lower_to_task_invocation(OpTaskInvocation const & op_task_invocation, layer_guid_t const & layer_guid) const {
+TaskInvocation RealmTrainingBacking::lower_to_task_invocation(
+    OpTaskInvocation const &op_task_invocation,
+    layer_guid_t const &layer_guid) const {
   TaskBinding binding;
   // tensors
-  for (auto const & tensor_binding: op_task_invocation.binding.get_tensor_bindings()) {
-    tensor_guid_t tensor_to_bind = [&] {
+  for (auto const &tensor_binding :
+       op_task_invocation.binding.get_tensor_bindings()) {
+    tensor_guid_t tensor_to_bind = [&]() -> tensor_guid_t {
       switch (tensor_binding.second.role) {
-      case TensorRole::INPUT:
-        return get_incoming_inputs(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      case TensorRole::OUTPUT:
-        return get_outgoing_tensors(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      case TensorRole::WEIGHT:
-        return get_incoming_weights(this->computation_graph, layer_guid).at(tensor_binding.second.idx);
-      default:
-        throw mk_runtime_error(fmt::format("Invalid tensor role {}", tensor_binding.second.role));
+        case TensorRole::INPUT:
+          return get_incoming_inputs(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::OUTPUT:
+          return get_outgoing_tensors(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        case TensorRole::WEIGHT:
+          return get_incoming_weights(this->computation_graph, layer_guid)
+              .at(tensor_binding.second.idx);
+        default:
+          throw mk_runtime_error(
+              fmt::format("Invalid tensor role {}", tensor_binding.second.role));
      }
-    }();
+    }();
     if (tensor_binding.first.is_grad == IsGrad::NO) {
       binding.bind(tensor_binding.first.slot_id, tensor_to_bind);
     } else if (tensor_binding.first.is_grad == IsGrad::YES) {
       binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind);
     } else {
-      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", tensor_binding.first.is_grad));
+      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
+                                         tensor_binding.first.is_grad));
     }
   }
   // args
-  for (auto const & arg_binding: op_task_invocation.binding.get_arg_bindings()) {
+  for (auto const &arg_binding :
+       op_task_invocation.binding.get_arg_bindings()) {
     if (arg_binding.second.has<OpArgRefSpec>()) {
-      ConcreteArgSpec concrete_arg = this->local_args_backing.lower_to_concrete_arg_spec(arg_binding.second.get<OpArgRefSpec>(), this->computation_graph, layer_guid);
+      ConcreteArgSpec concrete_arg =
+          this->realm_args_backing.lower_to_concrete_arg_spec(
+              arg_binding.second.get<OpArgRefSpec>(), this->computation_graph,
+              layer_guid);
       binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
     } else if (arg_binding.second.has<RuntimeArgRefSpec>()) {
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<RuntimeArgRefSpec>()});
     } else {
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
+      binding.insert_arg_spec(
+          arg_binding.first,
+          TaskArgSpec{arg_binding.second.get<ConcreteArgSpec>()});
    }
  }

From 8efaec7f2590bc4b8613c9f742910119d67df71a Mon Sep 17 00:00:00 2001
From: Reyna Abhyankar
Date: Wed, 5 Feb 2025 17:15:22 -0800
Subject: [PATCH 36/91] Build

---
 lib/kernels/include/kernels/array_shape.h     |  5 +--
 lib/kernels/include/kernels/legion_dim.h      |  2 +
 lib/kernels/src/allocation.cc                 |  1 -
 lib/kernels/src/array_shape.cc                | 38 +++++++---------
 lib/kernels/src/cuda/ops/concat_kernels.cu    |  3 +-
 lib/kernels/src/legion_dim.cc                 |  6 +++
 .../src/local-execution/ops/transpose.cc      | 28 +-----------
 .../src/local_cost_estimator.cc               |  2 +-
 lib/local-execution/src/loss_functions.cc     | 45 ++++++++++---------
 lib/local-execution/src/optimizer.cc          | 16 +++----
 lib/local-execution/src/task_registry.cc      |  6 +--
 11 files changed, 62 insertions(+), 90 deletions(-)

diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
index 09e53582ea..95d20ceca3 100644
--- a/lib/kernels/include/kernels/array_shape.h
+++ b/lib/kernels/include/kernels/array_shape.h
@@ -18,7 +18,7 @@ struct ArrayShape {
   explicit ArrayShape(nonnegative_int *dims, nonnegative_int num_dims);
   explicit ArrayShape(TensorShape const &shape);
   explicit ArrayShape(std::vector<nonnegative_int> const &);
-  explicit ArrayShape(LegionTensorDims const &);
+  explicit ArrayShape(LegionOrdered<nonnegative_int> const &);

   /**
    * @brief Alias of ArrayShape::num_elements for compatibility with
@@ -53,9 +53,6 @@ struct ArrayShape {
   ArrayShape sub_shape(std::optional<legion_dim_t> start,
                        std::optional<legion_dim_t> end) const;

-  bool operator==(ArrayShape const &) const;
-  bool operator!=(ArrayShape const &) const;
-
 public:
   LegionOrdered<nonnegative_int> dims;

diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index 7b9b9c455c..afab2d00b6 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -10,6 +10,8 @@ legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);

 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);

+ff_dim_t ff_dim_from_legion_dim(legion_dim_t, nonnegative_int num_dimensions);
+
 template <typename T>
 using LegionOrdered = DimOrdered<legion_dim_t, T>;

diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc
index cdc76371c8..114f817215 100644
--- a/lib/kernels/src/allocation.cc
+++ b/lib/kernels/src/allocation.cc
@@ -13,7 +13,6 @@ void Allocator::deallocate(void *ptr) {

 GenericTensorAccessorW
     Allocator::allocate_tensor(TensorShape const &tensor_shape) {
-  return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr};
   void *ptr =
       this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative());
   return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr};
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index fc35f47f3f..ea946b2882 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -22,7 +22,7 @@ ArrayShape::ArrayShape(TensorShape const &shape)
 ArrayShape::ArrayShape(std::vector<nonnegative_int> const &input_dims)
     : dims(input_dims) {}

-ArrayShape::ArrayShape(LegionTensorDims const &legion_tensor_dims)
+ArrayShape::ArrayShape(LegionOrdered<nonnegative_int> const &legion_tensor_dims)
     : dims(legion_tensor_dims) {}

 nonnegative_int ArrayShape::get_volume() const {
@@ -58,23 +58,23 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {

 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  std::optional<legion_dim_t> legion_start =
+  return ArrayShape{legion_ordered_from_ff_ordered(slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+}
+
+ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
+                                 std::optional<legion_dim_t> end) const {
+  std::optional<ff_dim_t> legion_start =
       transform(start, [&](auto const &start_unwrapped) {
-        return legion_dim_from_ff_dim(start_unwrapped, num_dims());
+        return ff_dim_from_legion_dim(start_unwrapped, num_dims());
       });
-  std::optional<legion_dim_t> legion_end =
+  std::optional<ff_dim_t> legion_end =
       transform(end, [&](auto const &end_unwrapped) {
-        return legion_dim_from_ff_dim(end_unwrapped, num_dims());
+        return ff_dim_from_legion_dim(end_unwrapped, num_dims());
       });
   return this->sub_shape(legion_start, legion_end);
 }

-ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
-                                 std::optional<legion_dim_t> end) const {
-  return ArrayShape{slice(this->dims, start, end)};
-}
-
 bool ArrayShape::operator==(ArrayShape const &other) const {
   return this->tie() == other.tie();
 }
@@ -83,11 +83,11 @@ bool ArrayShape::operator!=(ArrayShape const &other) const {
   return this->tie() != other.tie();
 }

-ArrayShape ArrayShape::sub_shape(
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
-    std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
-  NOT_IMPLEMENTED();
-}
+// ArrayShape ArrayShape::sub_shape(
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
+//     std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
+//   NOT_IMPLEMENTED();
+// }

 std::optional<nonnegative_int> ArrayShape::at_maybe(legion_dim_t index) const {
   if (index.value < dims.size()) {
@@ -114,14 +114,6 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) {
           dtype};
 }

-bool ArrayShape::operator==(ArrayShape const &other) const {
-  return this->dims == other.dims;
-}
-
-bool ArrayShape::operator!=(ArrayShape const &other) const {
-  return this->dims != other.dims;
-}
-
 std::string format_as(ArrayShape const &x) {
   std::ostringstream oss;
   oss << "
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
 namespace FlexFlow {
@@ -27,7 +28,7 @@ void calc_blk_size(size_t &num_blocks,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
   legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
-  blk_size = shape.sub_shape(legion_dim_t{0}, axis_legion_dim).num_elements().unwrap_nonnegative();
+  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim).num_elements().unwrap_nonnegative();
   num_blocks = shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
 }

diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc
index bbb15c5636..f89dd34d98 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/legion_dim.cc
@@ -13,4 +13,10 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t,
 ff_dim.value.unwrap_nonnegative() - 1}};
 }

+ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim,
+                                nonnegative_int num_dimensions) {
+  return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
+                                  legion_dim.value.unwrap_nonnegative() - 1}};
+}
+
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc
index a7647ebd18..bbe63bbed3 100644
--- a/lib/local-execution/src/local-execution/ops/transpose.cc
+++ b/lib/local-execution/src/local-execution/ops/transpose.cc
@@ -28,24 +28,8 @@ enum Slots {
   OUTPUT, // tensor
   ATTRS,
   PROFILING,
-  PER_DEVICE_STATE,
 };

-OpTaskInvocation init(TransposeAttrs const &attrs) {
-  OpTaskBinding binding;
-  binding.bind_arg(ATTRS, attrs);
-  return {task_id_t::TRANSPOSE_INIT_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto const &attrs = acc.get_argument<TransposeAttrs>(ATTRS);
-  std::vector<ff_dim_t> perm = inner_to_outer_idxs(attrs.perm);
-  TransposePerDeviceState per_device_state = init_kernel(perm.size(), perm);
-
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<TransposePerDeviceState>::create(per_device_state)};
-}

 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;
@@ -95,9 +79,6 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
   return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
 }

-TaskImplFunction get_transpose_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}

 TaskImplFunction get_transpose_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
@@ -107,13 +88,6 @@ TaskImplFunction get_transpose_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_transpose_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<TransposeAttrs>(ATTRS);
-  init.add_return_value<TransposePerDeviceState>();
-  return init;
-}

 OpTaskSignature get_transpose_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);
@@ -131,7 +105,7 @@ OpTaskSignature get_transpose_bwd_signature() {
 }

 std::vector<task_id_t> get_task_ids(TransposeAttrs const &) {
-  return {task_id_t::TRANSPOSE_INIT_TASK_ID, task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID};
+  return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID};
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc
index ef01783eb7..12c8031654 100644
--- a/lib/local-execution/src/local_cost_estimator.cc
+++ b/lib/local-execution/src/local_cost_estimator.cc
@@ -19,7 +19,7 @@ namespace FlexFlow {
 LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config)
     : runtime_arg_config(config) {}

-static ComputationGraph const &
+static ComputationGraph
     create_computation_graph_for_local_cost_estimation(
         PCGOperatorAttrs const &op,
         std::vector const &inputs,
         std::vector const &weights,
         std::vector const &outputs) {
   ComputationGraph computation_graph = make_empty_computation_graph();

   // create layer for inputs
diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc
index 93a792c466..a7fe68c995 100644
--- a/lib/local-execution/src/loss_functions.cc
+++ b/lib/local-execution/src/loss_functions.cc
@@ -17,6 +17,7 @@
 #include "kernels/loss_function_kernels.h"
 #include "local-execution/loss_functions.h"
 #include "local-execution/profiling.h"
+#include "utils/nonnegative_int/nonnegative_int.h"

 namespace FlexFlow {

@@ -54,35 +55,35 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) {
   auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD);
   auto logit =
acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{1}); + int batch_size = logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume(); + scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); - size_t ndim = logit.shape.num_dims(); - int num_classes = logit.shape.at(legion_dim_t{0}); + size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); + int num_classes = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); assert(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(ndim - 1)) / + k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() / label.shape.at(legion_dim_t( - ndim - 1)); // TODO FIXME something seems wrong here, isn't the + nonnegative_int{ndim - 1})).unwrap_nonnegative(); // TODO FIXME something seems wrong here, isn't the // numerator guaranteed to be 1? <--- this is not the // case because of the potential parallel dim } - assert(label.shape.sub_shape(legion_dim_t(1), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(ndim - 1)) == - logit.shape.at(legion_dim_t(ndim - 1))); - assert(label.shape.at(legion_dim_t(0)) == 1); + assert(label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); + assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() == + logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative()); + assert(label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -90,8 +91,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), batch_size, num_classes, k, @@ -99,7 +100,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { } else { assert(logit.shape == label.shape); assert(logit_grad.shape == logit.shape); - int num_channels = logit.shape.at(legion_dim_t{0}); + int num_channels = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, @@ -108,8 +109,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } @@ -120,8 +121,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - 
get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } @@ -131,15 +132,15 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape), - get_volume(logit_grad.shape), + get_volume(logit.shape).unwrap_nonnegative(), + get_volume(logit_grad.shape).unwrap_nonnegative(), scale_factor); break; } default: - throw mk_runtime_error( + throw mk_runtime_error(fmt::format( "Unsupported loss function {}. Please report this as an issue.", - loss_type); + loss_type)); } } } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 0c64147bd8..39c28fe83d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -59,11 +59,11 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); assert(weight.shape == weight_grad.shape); - size_t size = weight_grad.shape.get_volume(); + int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume() & weight.shape.get_volume() == 0); - size_t num_replicas = - weight_grad.shape.get_volume() / weight.shape.get_volume(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() & weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = + weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { @@ -153,11 +153,11 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); assert(weight.shape == weight_grad.shape); - size_t size = weight_grad.shape.get_volume(); + int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume() % weight.shape.get_volume() == 0); - size_t num_replicas = - weight_grad.shape.get_volume() / weight.shape.get_volume(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = + weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { auto handle = acc.get_argument(HANDLE); diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 9b7b55633c..6e63fc7a1e 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -36,8 +36,8 @@ void register_tasks_for_layer(TaskRegistry &task_registry, task_registry.backward_task_ids[op_id] = task_id; break; default: - throw mk_runtime_error("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type); + throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", + task_signature_impl.task_signature.type)); } task_registry.task_mapping.insert({task_id, task_signature_impl}); } @@ -58,7 +58,7 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, task_ids = task_registry.backward_task_ids; break; default: - throw mk_runtime_error("Invalid OpTaskType, got {}", op_task_type); + throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", op_task_type)); } return task_ids.at(op).has_value(); From 1dc1398458c6c330b8aade003e5e114464c9dc1f Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 
Feb 2025 17:25:16 -0800
Subject: [PATCH 37/91] Format

---
 lib/kernels/src/array_shape.cc             |  5 +--
 lib/kernels/src/cuda/ops/concat_kernels.cu |  7 ++--
 lib/kernels/src/legion_dim.cc              |  4 +--
 .../src/local-execution/ops/transpose.cc   |  3 --
 .../src/local_cost_estimator.cc            | 11 +++---
 lib/local-execution/src/loss_functions.cc  | 36 ++++++++++++-------
 lib/local-execution/src/optimizer.cc       | 15 ++++----
 lib/local-execution/src/task_registry.cc   |  8 +++--
 8 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index ea946b2882..220f8ebeea 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -1,8 +1,8 @@
 #include "kernels/array_shape.h"
 #include "op-attrs/dim_ordered/slice.h"
 #include "utils/containers/product.h"
-#include "utils/containers/transform.h"
 #include "utils/containers/reversed.h"
+#include "utils/containers/transform.h"
 #include "utils/containers/vector_of.h"
 #include "utils/nonnegative_int/num_elements.h"

@@ -58,7 +58,8 @@ nonnegative_int ArrayShape::at(ff_dim_t idx) const {

 ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                  std::optional<ff_dim_t> end) const {
-  return ArrayShape{legion_ordered_from_ff_ordered(slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
+  return ArrayShape{legion_ordered_from_ff_ordered(
+      slice(ff_ordered_from_legion_ordered(this->dims), start, end))};
 }

 ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu
index 8bb7d996b6..0365764de1 100644
--- a/lib/kernels/src/cuda/ops/concat_kernels.cu
+++ b/lib/kernels/src/cuda/ops/concat_kernels.cu
@@ -28,8 +28,11 @@ void calc_blk_size(size_t &num_blocks,
                    ArrayShape const &shape,
                    ff_dim_t axis) {
   legion_dim_t axis_legion_dim = legion_dim_from_ff_dim(axis, shape.num_dims());
-  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim).num_elements().unwrap_nonnegative();
-  num_blocks = shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
+  blk_size = shape.sub_shape(legion_dim_t{nonnegative_int{0}}, axis_legion_dim)
+                 .num_elements()
+                 .unwrap_nonnegative();
+  num_blocks =
+      shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative();
 }

 void forward_kernel(cudaStream_t stream,
diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc
index f89dd34d98..23875ad916 100644
--- a/lib/kernels/src/legion_dim.cc
+++ b/lib/kernels/src/legion_dim.cc
@@ -14,9 +14,9 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim,
 }

 ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim,
-                                  nonnegative_int num_dimensions) {
+                               nonnegative_int num_dimensions) {
   return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() -
-                                    legion_dim.value.unwrap_nonnegative() - 1}};
+                                  legion_dim.value.unwrap_nonnegative() - 1}};
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc
index bbe63bbed3..eafde9461e 100644
--- a/lib/local-execution/src/local-execution/ops/transpose.cc
+++ b/lib/local-execution/src/local-execution/ops/transpose.cc
@@ -30,7 +30,6 @@ enum Slots {
   PROFILING,
 };

-
 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;

@@ -79,7 +78,6 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) {
   return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
 }

-
 TaskImplFunction get_transpose_fwd_task_impl() {
   return
TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; } @@ -88,7 +86,6 @@ TaskImplFunction get_transpose_bwd_task_impl() { return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; } - OpTaskSignature get_transpose_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 12c8031654..85789c9505 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -19,12 +19,11 @@ namespace FlexFlow { LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) : runtime_arg_config(config) {} -static ComputationGraph - create_computation_graph_for_local_cost_estimation( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { +static ComputationGraph create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { ComputationGraph computation_graph = make_empty_computation_graph(); // create layer for inputs diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index a7fe68c995..32b66629d3 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -55,7 +55,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + int batch_size = + logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); @@ -69,21 +70,29 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + int num_classes = + logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); assert(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() / - label.shape.at(legion_dim_t( - nonnegative_int{ndim - 1})).unwrap_nonnegative(); // TODO FIXME something seems wrong here, isn't the - // numerator guaranteed to be 1? <--- this is not the - // case because of the potential parallel dim + k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative() / + label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative(); // TODO FIXME something seems wrong here, + // isn't the numerator guaranteed to be 1? 
+ // <--- this is not the case because of the + // potential parallel dim } - assert(label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative() == - logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})).unwrap_nonnegative()); - assert(label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == 1); + assert( + label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); + assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative() == + logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .unwrap_nonnegative()); + assert( + label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == + 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -100,7 +109,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { } else { assert(logit.shape == label.shape); assert(logit_grad.shape == logit.shape); - int num_channels = logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + int num_channels = + logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 39c28fe83d..76da26433d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -61,9 +61,10 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { assert(weight.shape == weight_grad.shape); int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); - int num_replicas = - weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == 0); + int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / + weight.shape.get_volume().unwrap_nonnegative(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { @@ -155,9 +156,11 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { assert(weight.shape == weight_grad.shape); int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % weight.shape.get_volume().unwrap_nonnegative() == 0); - int num_replicas = - weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == + 0); + int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / + weight.shape.get_volume().unwrap_nonnegative(); if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { auto handle = acc.get_argument(HANDLE); diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 6e63fc7a1e..7b0c80a9bc 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -36,8 +36,9 @@ void register_tasks_for_layer(TaskRegistry &task_registry, task_registry.backward_task_ids[op_id] = task_id; break; default: -
throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type)); + throw mk_runtime_error( + fmt::format("Invalid OpTaskType, got {}", + task_signature_impl.task_signature.type)); } task_registry.task_mapping.insert({task_id, task_signature_impl}); } @@ -58,7 +59,8 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, task_ids = task_registry.backward_task_ids; break; default: - throw mk_runtime_error(fmt::format("Invalid OpTaskType, got {}", op_task_type)); + throw mk_runtime_error( + fmt::format("Invalid OpTaskType, got {}", op_task_type)); } return task_ids.at(op).has_value(); From 17ad5c8855adf788146be53049a151a9785d84b1 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 Feb 2025 18:09:40 -0800 Subject: [PATCH 38/91] Split task spec files --- .proj.toml | 1 + lib/CMakeLists.txt | 1 + lib/local-execution/CMakeLists.txt | 1 + .../generic_task_impl_function.h | 2 +- .../local-execution/gradient_tensor_source.h | 2 +- .../init_op_task_impl_function.h | 2 +- .../local-execution/itask_argument_accessor.h | 6 ++--- .../local-execution/local_args_backing.h | 8 +++---- .../local-execution/local_cost_estimator.h | 2 +- .../local_task_argument_accessor.h | 2 +- .../local-execution/local_tensor_backing.h | 12 +++++----- .../include/local-execution/loss_functions.h | 6 ++--- .../local-execution/loss_tensor_source.h | 2 +- .../local-execution/lowered_tensor_source.h | 2 +- .../local-execution/model_training_instance.h | 2 +- .../include/local-execution/ops/attention.h | 4 ++-- .../local-execution/ops/batch_matmul.h | 6 ++--- .../include/local-execution/ops/batch_norm.h | 4 ++-- .../include/local-execution/ops/cast.h | 4 ++-- .../include/local-execution/ops/combine.h | 4 ++-- .../include/local-execution/ops/concat.h | 4 ++-- .../include/local-execution/ops/conv_2d.h | 4 ++-- .../include/local-execution/ops/dropout.h | 6 ++--- .../local-execution/ops/element_binary.h | 2 +- .../local-execution/ops/element_unary.h | 4 ++-- .../include/local-execution/ops/embedding.h | 4 ++-- .../include/local-execution/ops/flat.h | 3 ++- .../include/local-execution/ops/gather.h | 4 ++-- .../include/local-execution/ops/input.h | 2 +- .../include/local-execution/ops/layer_norm.h | 4 ++-- .../include/local-execution/ops/linear.h | 4 ++-- .../include/local-execution/ops/noop.h | 2 +- .../include/local-execution/ops/pool_2d.h | 4 ++-- .../include/local-execution/ops/reduce.h | 4 ++-- .../include/local-execution/ops/reduction.h | 4 ++-- .../include/local-execution/ops/repartition.h | 4 ++-- .../include/local-execution/ops/replicate.h | 4 ++-- .../include/local-execution/ops/reshape.h | 4 ++-- .../include/local-execution/ops/reverse.h | 4 ++-- .../include/local-execution/ops/softmax.h | 4 ++-- .../include/local-execution/ops/split.h | 4 ++-- .../include/local-execution/ops/topk.h | 4 ++-- .../include/local-execution/ops/transpose.h | 4 ++-- .../include/local-execution/ops/weight.h | 2 +- .../include/local-execution/optimizer.h | 4 ++-- .../local-execution/optimizer_tensor_source.h | 2 +- .../include/local-execution/sim_environment.h | 2 +- .../local-execution/task_argument_accessor.h | 4 ++-- .../include/local-execution/task_registry.h | 2 +- .../local-execution/task_registry.struct.toml | 2 +- .../local-execution/task_signature_impl.h | 4 ++-- .../task_signature_impl.struct.toml | 2 +- .../include/local-execution/tasks.h | 2 +- .../include/local-execution/tensor_lowering.h | 13 ----------- .../src/local-execution/ops/attention.cc | 2 +- 
.../src/local-execution/ops/batch_matmul.cc | 2 +- .../src/local-execution/ops/cast.cc | 2 +- .../src/local-execution/ops/combine.cc | 2 +- .../src/local-execution/ops/concat.cc | 4 ++-- .../src/local-execution/ops/dropout.cc | 4 ++-- lib/local-execution/src/local_args_backing.cc | 2 +- .../src/local_cost_estimator.cc | 2 +- .../src/local_tensor_backing.cc | 4 ++-- .../src/local_training_backing.cc | 6 ++--- lib/local-execution/src/loss_functions.cc | 2 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/src/per_device_state.cc | 2 +- lib/local-execution/src/task_binding.cc | 2 +- lib/local-execution/src/tensor_lowering.cc | 10 --------- .../test/src/test_local_slots_backing.cc | 2 +- lib/local-execution/test/src/test_loss_e2e.cc | 2 +- .../test/src/test_update_e2e.cc | 2 +- lib/task-spec/CMakeLists.txt | 16 ++++++++++++++ .../include/task-spec}/arg_ref.h | 2 +- .../include/task-spec}/concrete_arg.h | 2 +- .../include/task-spec}/config.h | 0 .../include/task-spec}/device_specific.h | 2 +- ...device_specific_device_states.variant.toml | 2 +- .../task-spec}/gradient_tensor_t.struct.toml | 0 .../include/task-spec}/is_grad.enum.toml | 0 .../include/task-spec}/is_trainable.enum.toml | 0 .../task-spec}/loss_tensor_t.struct.toml | 0 .../task-spec}/lowered_tensor_t.struct.toml | 0 .../include/task-spec}/op_arg_ref.h | 8 +++---- .../task-spec}/op_arg_ref_type.variant.toml | 4 ++-- .../include/task-spec}/op_arg_spec.h | 2 +- .../task-spec}/op_arg_spec.variant.toml | 6 ++--- .../task-spec}/op_slot_options.enum.toml | 0 .../include/task-spec}/op_task_invocation.h | 22 +++++++++---------- .../include/task-spec}/op_task_signature.h | 14 ++++++------ .../task-spec}/op_task_to_task_invocation.h | 8 +++---- .../include/task-spec}/op_task_type.enum.toml | 0 .../op_tensor_slot_spec.struct.toml | 10 ++++----- .../include/task-spec}/op_tensor_spec.h | 2 +- .../task-spec}/optimizer_tensor_t.struct.toml | 0 ...parallel_tensor_shape_ref_type.struct.toml | 0 .../include/task-spec}/per_device_op_state.h | 4 ++-- .../per_device_op_state.variant.toml | 0 .../per_device_op_state_ref_type.struct.toml | 0 .../include/task-spec}/profiling.h | 0 .../include/task-spec}/runtime_arg_config.h | 4 ++-- .../include/task-spec}/runtime_arg_ref.h | 8 +++---- .../include/task-spec}/serialization.h | 0 .../task-spec}/slot_grad_id.struct.toml | 4 ++-- .../include/task-spec}/slot_id_t.struct.toml | 0 .../slot_tensor_type_id.struct.toml | 4 ++-- .../include/task-spec}/slot_type.enum.toml | 0 .../task-spec}/task_arg_spec.variant.toml | 4 ++-- .../include/task-spec}/task_binding.h | 18 +++++++-------- .../include/task-spec}/task_id_t.enum.toml | 0 .../include/task-spec}/task_invocation.h | 2 +- .../task-spec}/task_invocation.struct.toml | 4 ++-- .../include/task-spec}/task_signature.h | 2 +- .../task-spec}/task_signature.struct.toml | 4 ++-- .../include/task-spec}/tensor_role.enum.toml | 0 .../include/task-spec}/tensor_type.enum.toml | 0 .../tensor_type_slot_spec.struct.toml | 6 ++--- .../task-spec}/tensor_type_t.variant.toml | 6 ++--- .../include/task-spec}/variadic_tensor_ref.h | 4 ++-- .../src/concrete_arg.cc | 2 +- .../src/op_arg_ref.cc | 2 +- .../src/op_arg_spec.cc | 2 +- .../src/op_task_invocation.cc | 4 ++-- .../src/op_task_signature.cc | 2 +- .../src/op_task_to_task_invocation.cc | 2 +- .../src/op_tensor_spec.cc | 2 +- .../src/runtime_arg_ref.cc | 4 ++-- .../src/task_invocation.cc | 2 +- .../src/task_signature.cc | 2 +- .../src/variadic_tensor_ref.cc | 2 +- 130 files changed, 225 insertions(+), 228 deletions(-) 
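The stat above records a newly created lib/task-spec/CMakeLists.txt (16 lines); its diff body appears further down in this patch. For orientation, a minimal sketch of what such a file could look like, assuming it follows the same ff_add_library(...) convention seen in the lib/local-execution/CMakeLists.txt hunk below; the keyword arguments and dependency list here are guesses, not the actual contents:

    # Hypothetical sketch only -- the NAME, SRC_PATTERNS, and DEPS values are
    # assumptions modeled on the ff_add_library call in lib/local-execution,
    # not the real 16-line file added by this patch.
    ff_add_library(
      NAME
        task-spec
      SRC_PATTERNS
        src/*.cc
      PUBLIC_INCLUDE
        include/
      DEPS
        op-attrs
        pcg
        utils
    )
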
delete mode 100644 lib/local-execution/include/local-execution/tensor_lowering.h delete mode 100644 lib/local-execution/src/tensor_lowering.cc create mode 100644 lib/task-spec/CMakeLists.txt rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/arg_ref.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/concrete_arg.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/config.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/device_specific.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/device_specific_device_states.variant.toml (98%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/gradient_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/is_grad.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/is_trainable.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/lowered_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_ref.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_ref_type.variant.toml (73%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_spec.h (85%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_arg_spec.variant.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_slot_options.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_invocation.h (86%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_signature.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_to_task_invocation.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_task_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_tensor_slot_spec.struct.toml (68%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/op_tensor_spec.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/optimizer_tensor_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/parallel_tensor_shape_ref_type.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state.h (71%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state.variant.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/per_device_op_state_ref_type.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/profiling.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/runtime_arg_config.h (80%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/runtime_arg_ref.h (81%) rename 
lib/{local-execution/include/local-execution => task-spec/include/task-spec}/serialization.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_grad_id.struct.toml (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_id_t.struct.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_tensor_type_id.struct.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/slot_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_arg_spec.variant.toml (77%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_binding.h (82%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_id_t.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_invocation.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_invocation.struct.toml (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature.h (97%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature.struct.toml (86%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_role.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type.enum.toml (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type_slot_spec.struct.toml (72%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/tensor_type_t.variant.toml (76%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/variadic_tensor_ref.h (81%) rename lib/{local-execution => task-spec}/src/concrete_arg.cc (94%) rename lib/{local-execution => task-spec}/src/op_arg_ref.cc (87%) rename lib/{local-execution => task-spec}/src/op_arg_spec.cc (83%) rename lib/{local-execution => task-spec}/src/op_task_invocation.cc (97%) rename lib/{local-execution => task-spec}/src/op_task_signature.cc (99%) rename lib/{local-execution => task-spec}/src/op_task_to_task_invocation.cc (98%) rename lib/{local-execution => task-spec}/src/op_tensor_spec.cc (89%) rename lib/{local-execution => task-spec}/src/runtime_arg_ref.cc (89%) rename lib/{local-execution => task-spec}/src/task_invocation.cc (77%) rename lib/{local-execution => task-spec}/src/task_signature.cc (93%) rename lib/{local-execution => task-spec}/src/variadic_tensor_ref.cc (75%) diff --git a/.proj.toml b/.proj.toml index 10307a6efa..94c2510671 100644 --- a/.proj.toml +++ b/.proj.toml @@ -12,6 +12,7 @@ build_targets = [ "compiler", "substitution-generator", "local-execution", + "task-spec", "models", "export-model-arch", "substitution-to-dot", diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 972c656126..e2e561c384 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(runtime) add_subdirectory(op-attrs) add_subdirectory(kernels) add_subdirectory(local-execution) +add_subdirectory(task-spec) add_subdirectory(utils) add_subdirectory(ffi) add_subdirectory(substitutions) diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index f649f86ce3..db0cf7603f 100644 --- a/lib/local-execution/CMakeLists.txt +++ 
b/lib/local-execution/CMakeLists.txt @@ -11,6 +11,7 @@ ff_add_library( op-attrs utils kernels + task-spec pcg spdlog ) diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h index 425740f61d..9ce22ecf54 100644 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ b/lib/local-execution/include/local-execution/generic_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#include "local-execution/device_specific_device_states.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h index bb7a4c7aa8..e7d24d1ca5 100644 --- a/lib/local-execution/include/local-execution/gradient_tensor_source.h +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H -#include "local-execution/gradient_tensor_t.dtg.h" +#include "task-spec/gradient_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h index 7b23a2bc64..0481e31a5f 100644 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ b/lib/local-execution/include/local-execution/init_op_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#include "local-execution/device_specific_device_states.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h index 9eff9460c2..24b3b3a37f 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/local-execution/include/local-execution/itask_argument_accessor.h @@ -2,10 +2,10 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "local-execution/concrete_arg.h" -#include "local-execution/op_task_signature.h" #include "local-execution/privilege_tensor_accessor.h" -#include "local-execution/tensor_type.dtg.h" +#include "task-spec/concrete_arg.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/tensor_type.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index 6e6839fea7..4c9ede54fd 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -2,12 +2,12 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include 
"pcg/computation_graph.h" #include "pcg/layer_guid_t.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_cost_estimator.h b/lib/local-execution/include/local-execution/local_cost_estimator.h index 350d8f5abd..0189475fcb 100644 --- a/lib/local-execution/include/local-execution/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/local_cost_estimator.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H #include "local-execution/cost_estimate.h" -#include "local-execution/runtime_arg_config.h" +#include "task-spec/runtime_arg_config.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index db0e98c2b1..b1e5a02985 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "task-spec/slot_tensor_type_id.dtg.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 825ff0553e..9d35373784 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -6,19 +6,19 @@ #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/lowered_tensor_source.h" -#include "local-execution/lowered_tensor_t.dtg.h" #include "local-execution/optimizer_tensor_source.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/tensor_type_t.dtg.h" #include "op-attrs/tensor_shape.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/tensor_role.dtg.h" +#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index b2a6d610c3..c06908503a 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,12 +16,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/loss_tensor_t.dtg.h" #include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/task_signature.h" #include 
"op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index 2b55f1af01..d9858cde40 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H -#include "local-execution/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h index e4fc4ff56c..bd0b90dd75 100644 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ b/lib/local-execution/include/local-execution/lowered_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H -#include "local-execution/lowered_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index bf0fc1a3c0..c264418abc 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LOCAL_EXECUTION_MODEL_TRAINING_INSTANCE_H #include "local-execution/local_training_backing.h" -#include "local-execution/loss_tensor_t.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h index 96f5aadcd9..bf5385f609 100644 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ b/lib/local-execution/include/local-execution/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_ATTENTION_H #define _FLEXFLOW_ATTENTION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h index 23389d5083..64d220ab66 100644 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ b/lib/local-execution/include/local-execution/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_BATCH_MATMUL_H #define _FLEXFLOW_BATCH_MATMUL_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h index 
36aa8ffa4e..85a7190ce1 100644 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ b/lib/local-execution/include/local-execution/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h index e7af6aca6b..6a27ad267a 100644 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ b/lib/local-execution/include/local-execution/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h index e85e8fba39..00e9cbed2c 100644 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ b/lib/local-execution/include/local-execution/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h index eab70d621c..c46164e417 100644 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ b/lib/local-execution/include/local-execution/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h index 0358d71eea..f3bb34ffeb 100644 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ b/lib/local-execution/include/local-execution/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h index a3dc5ff8af..bd7b426c6b 100644 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ b/lib/local-execution/include/local-execution/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" -#include "local-execution/task_id_t.dtg.h" +#include "local-execution/task_impl_function.dtg.h" #include 
"op-attrs/ops/dropout_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/task_id_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h index 72c0976df8..4e0bb46e47 100644 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ b/lib/local-execution/include/local-execution/ops/element_binary.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_ELEMENT_BINARY_H #define _FLEXFLOW_ELEMENT_BINARY_H -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "local-execution/task_signature_impl.h" #include "op-attrs/ops/element_binary_attrs.dtg.h" diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h index 04a72e2e12..9900668d6c 100644 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ b/lib/local-execution/include/local-execution/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h index 995d2296e1..b998aef53e 100644 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ b/lib/local-execution/include/local-execution/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h index e019bfc654..95afb98340 100644 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ b/lib/local-execution/include/local-execution/ops/flat.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h index e339683381..5569a94728 100644 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ b/lib/local-execution/include/local-execution/ops/gather.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h index baad25b798..9181478363 100644 --- a/lib/local-execution/include/local-execution/ops/input.h +++ b/lib/local-execution/include/local-execution/ops/input.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_INPUT_H #define _FLEXFLOW_INPUT_H 
-#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/input_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h index 8e034ac519..e4a15caac2 100644 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ b/lib/local-execution/include/local-execution/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h index 2aaf13a95a..d58d876865 100644 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ b/lib/local-execution/include/local-execution/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h index 1097adeb5e..adbc15cd3b 100644 --- a/lib/local-execution/include/local-execution/ops/noop.h +++ b/lib/local-execution/include/local-execution/ops/noop.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_NOOP_H #define _FLEXFLOW_NOOP_H -#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/noop_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h index 908fd5462f..7d0ec44bd7 100644 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ b/lib/local-execution/include/local-execution/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h index 7900c28159..5c6d4be338 100644 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ b/lib/local-execution/include/local-execution/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h index 56833602e6..7475d3aeb4 100644 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ b/lib/local-execution/include/local-execution/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef 
_FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h index 5187d04ca0..08ecdafcf2 100644 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ b/lib/local-execution/include/local-execution/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h index 85d1dff41a..b827b9c272 100644 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ b/lib/local-execution/include/local-execution/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h index 37f07534ee..ed7e6e9e31 100644 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ b/lib/local-execution/include/local-execution/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h index 7c16073be7..dd0e89ecad 100644 --- a/lib/local-execution/include/local-execution/ops/reverse.h +++ b/lib/local-execution/include/local-execution/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h index d440fe7239..294d948b42 100644 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ b/lib/local-execution/include/local-execution/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SOFTMAX_H #define _FLEXFLOW_SOFTMAX_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/split.h 
b/lib/local-execution/include/local-execution/ops/split.h index dde46c20bf..49cd7cfc7b 100644 --- a/lib/local-execution/include/local-execution/ops/split.h +++ b/lib/local-execution/include/local-execution/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h index c8f3175ebd..aeded512cd 100644 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ b/lib/local-execution/include/local-execution/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h index f2ce014aa7..2c7b5fb3bc 100644 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ b/lib/local-execution/include/local-execution/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "local-execution/op_task_invocation.h" -#include "local-execution/sim_environment.h" +#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h index e59a88f07d..162236e41e 100644 --- a/lib/local-execution/include/local-execution/ops/weight.h +++ b/lib/local-execution/include/local-execution/ops/weight.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_WEIGHT_H #define _FLEXFLOW_WEIGHT_H -#include "local-execution/op_task_invocation.h" #include "op-attrs/ops/weight_attrs.dtg.h" +#include "task-spec/op_task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 3a092e34c6..f6bd5a3ee9 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -2,11 +2,11 @@ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/task_signature.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index 658c545225..7a5057c84a 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H #define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H 
-#include "local-execution/optimizer_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h index 7c81cba408..6c2f8d4ebb 100644 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ b/lib/local-execution/include/local-execution/sim_environment.h @@ -4,11 +4,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "local-execution/cost_metrics.h" -#include "local-execution/op_task_invocation.h" #include "local-execution/task_argument_accessor.h" #include "local-execution/task_signature_impl.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "pcg/machine_view.h" +#include "task-spec/op_task_invocation.h" #include namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 0cbeaf04c8..99c1c1296b 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/device_specific.h" #include "local-execution/itask_argument_accessor.h" -#include "local-execution/per_device_op_state.dtg.h" +#include "task-spec/device_specific.h" +#include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index cb717ca2af..22cc344b3d 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -2,10 +2,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#include "local-execution/op_task_type.dtg.h" #include "local-execution/task_registry.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "pcg/computation_graph.dtg.h" +#include "task-spec/op_task_type.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index ada467a67d..c3784b617f 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -8,7 +8,7 @@ features = [ includes = [ "local-execution/task_signature_impl.dtg.h", - "local-execution/task_id_t.dtg.h", + "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/local-execution/include/local-execution/task_signature_impl.h index 98c5c0cb3b..613a173f25 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.h +++ b/lib/local-execution/include/local-execution/task_signature_impl.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/task_id_t.dtg.h" #include "local-execution/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/task_id_t.dtg.h" namespace FlexFlow { diff --git 
a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml index 981794503b..78064203ec 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml @@ -8,7 +8,7 @@ features = [ includes = [ "local-execution/task_impl_function.dtg.h", - "local-execution/op_task_signature.h", + "task-spec/op_task_signature.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/tasks.h b/lib/local-execution/include/local-execution/tasks.h index 4f5b26c43b..aae3b3fe44 100644 --- a/lib/local-execution/include/local-execution/tasks.h +++ b/lib/local-execution/include/local-execution/tasks.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASKS_H #define _FLEXFLOW_LOCAL_EXECUTION_TASKS_H -#include "local-execution/task_id_t.dtg.h" +#include "task-spec/task_id_t.dtg.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/tensor_lowering.h b/lib/local-execution/include/local-execution/tensor_lowering.h deleted file mode 100644 index 5f3870c1d2..0000000000 --- a/lib/local-execution/include/local-execution/tensor_lowering.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_TENSOR_REDUCTION_H - -#include "local-execution/lowered_tensor_t.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" - -namespace FlexFlow { - -lowered_tensor_t lower(tensor_guid_t const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc index 6401d5beac..a9e6a9fa30 100644 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ b/lib/local-execution/src/local-execution/ops/attention.cc @@ -15,9 +15,9 @@ #include "local-execution/ops/attention.h" #include "kernels/attention_kernels.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc index cd22cee283..c780ab6eca 100644 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ b/lib/local-execution/src/local-execution/ops/batch_matmul.cc @@ -15,9 +15,9 @@ #include "local-execution/ops/batch_matmul.h" #include "kernels/batch_matmul_kernels.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" +#include "task-spec/op_task_signature.h" #include "utils/containers/transform.h" #include "utils/nonnegative_int/nonnegative_range.h" diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc index 846faa9262..e5dd7f9c4e 100644 --- a/lib/local-execution/src/local-execution/ops/cast.cc +++ b/lib/local-execution/src/local-execution/ops/cast.cc @@ -16,7 +16,7 @@ #include "local-execution/ops/cast.h" #include "kernels/cast_kernels.h" -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" #include "utils/hash-utils.h" using namespace FlexFlow::Kernels::Cast; diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc index 
b7e84878f4..32fab636d3 100644 --- a/lib/local-execution/src/local-execution/ops/combine.cc +++ b/lib/local-execution/src/local-execution/ops/combine.cc @@ -15,7 +15,7 @@ #include "local-execution/ops/combine.h" #include "kernels/combine_kernels.h" -#include "local-execution/op_task_invocation.h" +#include "task-spec/op_task_invocation.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc index dee1dd08e5..51370e7a4d 100644 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ b/lib/local-execution/src/local-execution/ops/concat.cc @@ -16,9 +16,9 @@ #include "local-execution/ops/concat.h" #include "kernels/concat_kernels.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/variadic_tensor_ref.h" #include "op-attrs/get_output_shapes.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc index 017d023ec4..cef1ea6c93 100644 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ b/lib/local-execution/src/local-execution/ops/dropout.cc @@ -1,8 +1,8 @@ #include "local-execution/ops/dropout.h" #include "kernels/dropout_kernels.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_task_signature.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index d8a94fb2c5..715a96efa6 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -1,6 +1,6 @@ #include "local-execution/local_args_backing.h" -#include "local-execution/op_task_to_task_invocation.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/op_task_to_task_invocation.h" #include "utils/containers/contains_key.h" #include "utils/containers/map_values.h" #include "utils/overload.h" diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 85789c9505..31418c6bea 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -1,7 +1,7 @@ #include "local-execution/local_cost_estimator.h" #include "kernels/device.h" #include "kernels/local_cuda_allocator.h" -#include "local-execution/tensor_lowering.h" + #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index de058d88ad..00c170d501 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,6 +1,6 @@ #include "local-execution/local_tensor_backing.h" -#include "local-execution/slot_grad_id.dtg.h" -#include "local-execution/tensor_lowering.h" +#include "task-spec/slot_grad_id.dtg.h" + #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 144596820a..2679a502e3 
100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,10 +1,10 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" -#include "local-execution/op_task_to_task_invocation.h" #include "local-execution/optimizer.h" -#include "local-execution/task_invocation.h" #include "local-execution/task_signature_impl.h" -#include "local-execution/tensor_lowering.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/task_invocation.h" + #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/contains.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 32b66629d3..15ebdd5f28 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,7 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" -#include "local-execution/profiling.h" +#include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 76da26433d..a69ae9da61 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,6 +1,6 @@ #include "local-execution/optimizer.h" #include "kernels/optimizer_kernels.h" -#include "local-execution/profiling.h" +#include "task-spec/profiling.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/local-execution/src/per_device_state.cc b/lib/local-execution/src/per_device_state.cc index fa470b196d..a959f4a8c9 100644 --- a/lib/local-execution/src/per_device_state.cc +++ b/lib/local-execution/src/per_device_state.cc @@ -1,4 +1,4 @@ -#include "local-execution/per_device_op_state.h" +#include "task-spec/per_device_op_state.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 7684511488..4537493c1d 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_binding.h" +#include "task-spec/task_binding.h" #include "pcg/tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" diff --git a/lib/local-execution/src/tensor_lowering.cc b/lib/local-execution/src/tensor_lowering.cc deleted file mode 100644 index 63be366d94..0000000000 --- a/lib/local-execution/src/tensor_lowering.cc +++ /dev/null @@ -1,10 +0,0 @@ -#include "local-execution/tensor_lowering.h" -#include "utils/containers/transform.h" - -namespace FlexFlow { - -lowered_tensor_t lower(tensor_guid_t const &tensor_guid) { - return lowered_tensor_t{tensor_guid.raw_graph_output.node.raw_uid}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 7568265446..e5ca58bc1f 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -2,7 +2,7 @@ #include "local-execution/local_cost_estimator.h" #include "local-execution/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" -#include "local-execution/tensor_lowering.h" + #include "op-attrs/ops/attention.h" #include "op-attrs/parallel_tensor_shape.h" #include 
"pcg/computation_graph.h" diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc index 210cd1af83..62778c2e32 100644 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ b/lib/local-execution/test/src/test_loss_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_lowering.h" + #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc index d16c5e5b0b..4658a2a544 100644 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ b/lib/local-execution/test/src/test_update_e2e.cc @@ -3,7 +3,7 @@ #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/local_training_backing.h" -#include "local-execution/tensor_lowering.h" + #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt new file mode 100644 index 0000000000..8deb20a593 --- /dev/null +++ b/lib/task-spec/CMakeLists.txt @@ -0,0 +1,16 @@ +ff_add_library( + NAME + task-spec + SRC_PATTERNS + src/*.cc + PUBLIC_INCLUDE + include/ + PRIVATE_INCLUDE + src/ + DEPS + op-attrs + utils + kernels + pcg + spdlog +) diff --git a/lib/local-execution/include/local-execution/arg_ref.h b/lib/task-spec/include/task-spec/arg_ref.h similarity index 97% rename from lib/local-execution/include/local-execution/arg_ref.h rename to lib/task-spec/include/task-spec/arg_ref.h index 75eecda273..8d3402c578 100644 --- a/lib/local-execution/include/local-execution/arg_ref.h +++ b/lib/task-spec/include/task-spec/arg_ref.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_ARG_REF_H #include "kernels/ff_handle.h" -// #include "local-execution/serialization.h +// #include "task-spec/serialization.h #include "utils/type_index.h" #include "utils/visitable.h" diff --git a/lib/local-execution/include/local-execution/concrete_arg.h b/lib/task-spec/include/task-spec/concrete_arg.h similarity index 97% rename from lib/local-execution/include/local-execution/concrete_arg.h rename to lib/task-spec/include/task-spec/concrete_arg.h index cee52ba4a2..7b2ece59a7 100644 --- a/lib/local-execution/include/local-execution/concrete_arg.h +++ b/lib/task-spec/include/task-spec/concrete_arg.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H #include "fmt/format.h" -#include "local-execution/serialization.h" +#include "task-spec/serialization.h" #include "utils/hash-utils.h" #include "utils/type_index.h" #include diff --git a/lib/local-execution/include/local-execution/config.h b/lib/task-spec/include/task-spec/config.h similarity index 100% rename from lib/local-execution/include/local-execution/config.h rename to lib/task-spec/include/task-spec/config.h diff --git a/lib/local-execution/include/local-execution/device_specific.h b/lib/task-spec/include/task-spec/device_specific.h similarity index 97% rename from lib/local-execution/include/local-execution/device_specific.h rename to lib/task-spec/include/task-spec/device_specific.h index 4035aaf7cf..3ef017f704 100644 --- a/lib/local-execution/include/local-execution/device_specific.h +++ b/lib/task-spec/include/task-spec/device_specific.h @@ -1,7 +1,7 @@ #ifndef 
_FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H #define _FLEXFLOW_LOCAL_EXECUTION_DEVICE_SPECIFIC_H -#include "local-execution/serialization.h" +#include "task-spec/serialization.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml similarity index 98% rename from lib/local-execution/include/local-execution/device_specific_device_states.variant.toml rename to lib/task-spec/include/task-spec/device_specific_device_states.variant.toml index db476e771d..944dddc3df 100644 --- a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml +++ b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml @@ -22,7 +22,7 @@ includes = [ "kernels/softmax_kernels.h", "kernels/topk_kernels.h", "kernels/transpose_kernels.h", - "local-execution/device_specific.h", + "task-spec/device_specific.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml b/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/gradient_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/is_grad.enum.toml b/lib/task-spec/include/task-spec/is_grad.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/is_grad.enum.toml rename to lib/task-spec/include/task-spec/is_grad.enum.toml diff --git a/lib/local-execution/include/local-execution/is_trainable.enum.toml b/lib/task-spec/include/task-spec/is_trainable.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/is_trainable.enum.toml rename to lib/task-spec/include/task-spec/is_trainable.enum.toml diff --git a/lib/local-execution/include/local-execution/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/loss_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/loss_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml b/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/lowered_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/op_arg_ref.h b/lib/task-spec/include/task-spec/op_arg_ref.h similarity index 79% rename from lib/local-execution/include/local-execution/op_arg_ref.h rename to lib/task-spec/include/task-spec/op_arg_ref.h index 102a8d4362..d95573787a 100644 --- a/lib/local-execution/include/local-execution/op_arg_ref.h +++ b/lib/task-spec/include/task-spec/op_arg_ref.h @@ -1,11 +1,11 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/device_specific.h" -#include "local-execution/op_arg_ref_type.dtg.h" -#include "local-execution/per_device_op_state.h" #include "op-attrs/parallel_tensor_shape.dtg.h" +#include "task-spec/arg_ref.h" +#include "task-spec/device_specific.h" +#include "task-spec/op_arg_ref_type.dtg.h" +#include "task-spec/per_device_op_state.h" namespace FlexFlow { diff --git 
a/lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml b/lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml similarity index 73% rename from lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml rename to lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml index cd226da161..e0452c6ce2 100644 --- a/lib/local-execution/include/local-execution/op_arg_ref_type.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_ref_type.variant.toml @@ -9,8 +9,8 @@ features = [ ] includes = [ - "local-execution/per_device_op_state_ref_type.dtg.h", - "local-execution/parallel_tensor_shape_ref_type.dtg.h", + "task-spec/per_device_op_state_ref_type.dtg.h", + "task-spec/parallel_tensor_shape_ref_type.dtg.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/op_arg_spec.h b/lib/task-spec/include/task-spec/op_arg_spec.h similarity index 85% rename from lib/local-execution/include/local-execution/op_arg_spec.h rename to lib/task-spec/include/task-spec/op_arg_spec.h index 4f3ccd066e..1dc4efcdd1 100644 --- a/lib/local-execution/include/local-execution/op_arg_spec.h +++ b/lib/task-spec/include/task-spec/op_arg_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OP_ARG_SPEC_H #define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OP_ARG_SPEC_H -#include "local-execution/op_arg_spec.dtg.h" +#include "task-spec/op_arg_spec.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/op_arg_spec.variant.toml b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml similarity index 76% rename from lib/local-execution/include/local-execution/op_arg_spec.variant.toml rename to lib/task-spec/include/task-spec/op_arg_spec.variant.toml index 28169902ae..e52e5c914e 100644 --- a/lib/local-execution/include/local-execution/op_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "local-execution/concrete_arg.h", - "local-execution/op_arg_ref.h", - "local-execution/runtime_arg_ref.h", + "task-spec/concrete_arg.h", + "task-spec/op_arg_ref.h", + "task-spec/runtime_arg_ref.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/op_slot_options.enum.toml b/lib/task-spec/include/task-spec/op_slot_options.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/op_slot_options.enum.toml rename to lib/task-spec/include/task-spec/op_slot_options.enum.toml diff --git a/lib/local-execution/include/local-execution/op_task_invocation.h b/lib/task-spec/include/task-spec/op_task_invocation.h similarity index 86% rename from lib/local-execution/include/local-execution/op_task_invocation.h rename to lib/task-spec/include/task-spec/op_task_invocation.h index 0f351c3a0e..cce0a4d6a6 100644 --- a/lib/local-execution/include/local-execution/op_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_invocation.h @@ -2,17 +2,17 @@ #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #include "kernels/accessor.h" -#include "local-execution/concrete_arg.h" -#include "local-execution/is_trainable.dtg.h" -#include "local-execution/op_arg_ref.h" -#include "local-execution/op_arg_spec.dtg.h" -#include "local-execution/op_task_signature.h" -#include "local-execution/op_tensor_spec.h" -#include "local-execution/profiling.h" -#include "local-execution/runtime_arg_ref.h" -#include "local-execution/slot_grad_id.dtg.h" -#include "local-execution/task_id_t.dtg.h" -#include 
"local-execution/variadic_tensor_ref.h" +#include "task-spec/concrete_arg.h" +#include "task-spec/is_trainable.dtg.h" +#include "task-spec/op_arg_ref.h" +#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_task_signature.h" +#include "task-spec/op_tensor_spec.h" +#include "task-spec/profiling.h" +#include "task-spec/runtime_arg_ref.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/task_id_t.dtg.h" +#include "task-spec/variadic_tensor_ref.h" #include #include #include diff --git a/lib/local-execution/include/local-execution/op_task_signature.h b/lib/task-spec/include/task-spec/op_task_signature.h similarity index 91% rename from lib/local-execution/include/local-execution/op_task_signature.h rename to lib/task-spec/include/task-spec/op_task_signature.h index 0447644354..eba0023906 100644 --- a/lib/local-execution/include/local-execution/op_task_signature.h +++ b/lib/task-spec/include/task-spec/op_task_signature.h @@ -1,13 +1,13 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_SIGNATURE_H -#include "local-execution/is_grad.dtg.h" -#include "local-execution/op_task_type.dtg.h" -#include "local-execution/op_tensor_slot_spec.dtg.h" -#include "local-execution/serialization.h" -#include "local-execution/slot_id_t.dtg.h" -#include "local-execution/slot_type.dtg.h" -#include "local-execution/task_id_t.dtg.h" +#include "task-spec/is_grad.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "task-spec/op_tensor_slot_spec.dtg.h" +#include "task-spec/serialization.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/slot_type.dtg.h" +#include "task-spec/task_id_t.dtg.h" #include "utils/hash/unordered_map.h" #include "utils/hash/unordered_set.h" #include "utils/type_index.h" diff --git a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h similarity index 79% rename from lib/local-execution/include/local-execution/op_task_to_task_invocation.h rename to lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 02b3c938b0..0c5fdb39a4 100644 --- a/lib/local-execution/include/local-execution/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -1,12 +1,12 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/op_task_invocation.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/op_task_type.enum.toml b/lib/task-spec/include/task-spec/op_task_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/op_task_type.enum.toml rename to lib/task-spec/include/task-spec/op_task_type.enum.toml diff --git a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml similarity index 68% rename from lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml rename to 
lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml index 590dbe6362..109ddf36af 100644 --- a/lib/local-execution/include/local-execution/op_tensor_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml @@ -8,11 +8,11 @@ features = [ ] includes = [ - "local-execution/slot_id_t.dtg.h", - "local-execution/slot_type.dtg.h", - "local-execution/tensor_role.dtg.h", - "local-execution/is_grad.dtg.h", - "local-execution/op_slot_options.dtg.h", + "task-spec/slot_id_t.dtg.h", + "task-spec/slot_type.dtg.h", + "task-spec/tensor_role.dtg.h", + "task-spec/is_grad.dtg.h", + "task-spec/op_slot_options.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/op_tensor_spec.h b/lib/task-spec/include/task-spec/op_tensor_spec.h similarity index 92% rename from lib/local-execution/include/local-execution/op_tensor_spec.h rename to lib/task-spec/include/task-spec/op_tensor_spec.h index 29d6cef628..c957704a10 100644 --- a/lib/local-execution/include/local-execution/op_tensor_spec.h +++ b/lib/task-spec/include/task-spec/op_tensor_spec.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml b/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/optimizer_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml diff --git a/lib/local-execution/include/local-execution/parallel_tensor_shape_ref_type.struct.toml b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/parallel_tensor_shape_ref_type.struct.toml rename to lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml diff --git a/lib/local-execution/include/local-execution/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h similarity index 71% rename from lib/local-execution/include/local-execution/per_device_op_state.h rename to lib/task-spec/include/task-spec/per_device_op_state.h index 1edd5b6360..23312d90a5 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.h +++ b/lib/task-spec/include/task-spec/per_device_op_state.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H #define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H -#include "local-execution/device_specific_device_states.dtg.h" -#include "local-execution/per_device_op_state.dtg.h" +#include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state.variant.toml rename to lib/task-spec/include/task-spec/per_device_op_state.variant.toml diff --git a/lib/local-execution/include/local-execution/per_device_op_state_ref_type.struct.toml b/lib/task-spec/include/task-spec/per_device_op_state_ref_type.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/per_device_op_state_ref_type.struct.toml rename to 
lib/task-spec/include/task-spec/per_device_op_state_ref_type.struct.toml diff --git a/lib/local-execution/include/local-execution/profiling.h b/lib/task-spec/include/task-spec/profiling.h similarity index 100% rename from lib/local-execution/include/local-execution/profiling.h rename to lib/task-spec/include/task-spec/profiling.h diff --git a/lib/local-execution/include/local-execution/runtime_arg_config.h b/lib/task-spec/include/task-spec/runtime_arg_config.h similarity index 80% rename from lib/local-execution/include/local-execution/runtime_arg_config.h rename to lib/task-spec/include/task-spec/runtime_arg_config.h index 31b3479a14..f4320bc40b 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_config.h +++ b/lib/task-spec/include/task-spec/runtime_arg_config.h @@ -2,8 +2,8 @@ #define _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H #include "kernels/ff_handle.h" -#include "local-execution/device_specific.h" -#include "local-execution/profiling.h" +#include "task-spec/device_specific.h" +#include "task-spec/profiling.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/runtime_arg_ref.h b/lib/task-spec/include/task-spec/runtime_arg_ref.h similarity index 81% rename from lib/local-execution/include/local-execution/runtime_arg_ref.h rename to lib/task-spec/include/task-spec/runtime_arg_ref.h index a225a813df..33fccb0106 100644 --- a/lib/local-execution/include/local-execution/runtime_arg_ref.h +++ b/lib/task-spec/include/task-spec/runtime_arg_ref.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/config.h" -#include "local-execution/device_specific.h" -#include "local-execution/profiling.h" +#include "task-spec/arg_ref.h" +#include "task-spec/config.h" +#include "task-spec/device_specific.h" +#include "task-spec/profiling.h" #include "utils/fmt.h" #include "utils/type_index.h" diff --git a/lib/local-execution/include/local-execution/serialization.h b/lib/task-spec/include/task-spec/serialization.h similarity index 100% rename from lib/local-execution/include/local-execution/serialization.h rename to lib/task-spec/include/task-spec/serialization.h diff --git a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml b/lib/task-spec/include/task-spec/slot_grad_id.struct.toml similarity index 75% rename from lib/local-execution/include/local-execution/slot_grad_id.struct.toml rename to lib/task-spec/include/task-spec/slot_grad_id.struct.toml index 256091d272..a6533ea884 100644 --- a/lib/local-execution/include/local-execution/slot_grad_id.struct.toml +++ b/lib/task-spec/include/task-spec/slot_grad_id.struct.toml @@ -8,8 +8,8 @@ features = [ ] includes = [ - "local-execution/is_grad.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/is_grad.dtg.h", + "task-spec/slot_id_t.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/slot_id_t.struct.toml b/lib/task-spec/include/task-spec/slot_id_t.struct.toml similarity index 100% rename from lib/local-execution/include/local-execution/slot_id_t.struct.toml rename to lib/task-spec/include/task-spec/slot_id_t.struct.toml diff --git a/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml b/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml similarity index 76% rename from lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml rename to 
lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml index b3b3a320c7..ab5b981637 100644 --- a/lib/local-execution/include/local-execution/slot_tensor_type_id.struct.toml +++ b/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml @@ -8,8 +8,8 @@ features = [ ] includes = [ - "local-execution/tensor_type.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/tensor_type.dtg.h", + "task-spec/slot_id_t.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/slot_type.enum.toml b/lib/task-spec/include/task-spec/slot_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/slot_type.enum.toml rename to lib/task-spec/include/task-spec/slot_type.enum.toml diff --git a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml similarity index 77% rename from lib/local-execution/include/local-execution/task_arg_spec.variant.toml rename to lib/task-spec/include/task-spec/task_arg_spec.variant.toml index 271e3b73d6..0f81f93405 100644 --- a/lib/local-execution/include/local-execution/task_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/concrete_arg.h", - "local-execution/runtime_arg_ref.h" + "task-spec/concrete_arg.h", + "task-spec/runtime_arg_ref.h" ] [[values]] diff --git a/lib/local-execution/include/local-execution/task_binding.h b/lib/task-spec/include/task-spec/task_binding.h similarity index 82% rename from lib/local-execution/include/local-execution/task_binding.h rename to lib/task-spec/include/task-spec/task_binding.h index aba0c01a65..a945fec1d7 100644 --- a/lib/local-execution/include/local-execution/task_binding.h +++ b/lib/task-spec/include/task-spec/task_binding.h @@ -1,15 +1,15 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "local-execution/loss_tensor_t.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/slot_id_t.dtg.h" -#include "local-execution/slot_tensor_type_id.dtg.h" -#include "local-execution/task_arg_spec.dtg.h" -#include "local-execution/task_id_t.dtg.h" -#include "local-execution/task_signature.dtg.h" -#include "local-execution/tensor_type_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_arg_spec.dtg.h" +#include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_signature.dtg.h" +#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_id_t.enum.toml b/lib/task-spec/include/task-spec/task_id_t.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/task_id_t.enum.toml rename to lib/task-spec/include/task-spec/task_id_t.enum.toml diff --git a/lib/local-execution/include/local-execution/task_invocation.h b/lib/task-spec/include/task-spec/task_invocation.h similarity index 81% rename from lib/local-execution/include/local-execution/task_invocation.h rename to lib/task-spec/include/task-spec/task_invocation.h index d03d6ac8e1..85940091a1 100644 --- a/lib/local-execution/include/local-execution/task_invocation.h +++ b/lib/task-spec/include/task-spec/task_invocation.h @@ -1,7 
+1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_INVOCATION_H -#include "local-execution/task_invocation.dtg.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_invocation.struct.toml b/lib/task-spec/include/task-spec/task_invocation.struct.toml similarity index 75% rename from lib/local-execution/include/local-execution/task_invocation.struct.toml rename to lib/task-spec/include/task-spec/task_invocation.struct.toml index c9e1e22ba1..38e02a1370 100644 --- a/lib/local-execution/include/local-execution/task_invocation.struct.toml +++ b/lib/task-spec/include/task-spec/task_invocation.struct.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/task_binding.h", - "local-execution/task_id_t.dtg.h" + "task-spec/task_binding.h", + "task-spec/task_id_t.dtg.h" ] diff --git a/lib/local-execution/include/local-execution/task_signature.h b/lib/task-spec/include/task-spec/task_signature.h similarity index 97% rename from lib/local-execution/include/local-execution/task_signature.h rename to lib/task-spec/include/task-spec/task_signature.h index b10edce6d4..8214e7e1b5 100644 --- a/lib/local-execution/include/local-execution/task_signature.h +++ b/lib/task-spec/include/task-spec/task_signature.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_H -#include "local-execution/task_signature.dtg.h" +#include "task-spec/task_signature.dtg.h" #include "utils/type_index.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_signature.struct.toml b/lib/task-spec/include/task-spec/task_signature.struct.toml similarity index 86% rename from lib/local-execution/include/local-execution/task_signature.struct.toml rename to lib/task-spec/include/task-spec/task_signature.struct.toml index 7efb0c658a..3df0a8cfc7 100644 --- a/lib/local-execution/include/local-execution/task_signature.struct.toml +++ b/lib/task-spec/include/task-spec/task_signature.struct.toml @@ -7,8 +7,8 @@ features = [ ] includes = [ - "local-execution/tensor_type_slot_spec.dtg.h", - "local-execution/slot_id_t.dtg.h", + "task-spec/tensor_type_slot_spec.dtg.h", + "task-spec/slot_id_t.dtg.h", "", "" ] diff --git a/lib/local-execution/include/local-execution/tensor_role.enum.toml b/lib/task-spec/include/task-spec/tensor_role.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/tensor_role.enum.toml rename to lib/task-spec/include/task-spec/tensor_role.enum.toml diff --git a/lib/local-execution/include/local-execution/tensor_type.enum.toml b/lib/task-spec/include/task-spec/tensor_type.enum.toml similarity index 100% rename from lib/local-execution/include/local-execution/tensor_type.enum.toml rename to lib/task-spec/include/task-spec/tensor_type.enum.toml diff --git a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml b/lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml similarity index 72% rename from lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml rename to lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml index ceba809474..26e70a5ef8 100644 --- a/lib/local-execution/include/local-execution/tensor_type_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/tensor_type_slot_spec.struct.toml @@ -8,9 +8,9 @@ features = [ ] includes = [ - "local-execution/slot_type.dtg.h", - 
"local-execution/slot_id_t.dtg.h", - "local-execution/tensor_type.dtg.h", + "task-spec/slot_type.dtg.h", + "task-spec/slot_id_t.dtg.h", + "task-spec/tensor_type.dtg.h", ] [[fields]] diff --git a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml similarity index 76% rename from lib/local-execution/include/local-execution/tensor_type_t.variant.toml rename to lib/task-spec/include/task-spec/tensor_type_t.variant.toml index cd3520ee5d..b93ed91081 100644 --- a/lib/local-execution/include/local-execution/tensor_type_t.variant.toml +++ b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml @@ -9,9 +9,9 @@ features = [ includes = [ "pcg/tensor_guid_t.dtg.h", - "local-execution/optimizer_tensor_t.dtg.h", - "local-execution/gradient_tensor_t.dtg.h", - "local-execution/loss_tensor_t.dtg.h" + "task-spec/optimizer_tensor_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/loss_tensor_t.dtg.h" ] [[values]] diff --git a/lib/local-execution/include/local-execution/variadic_tensor_ref.h b/lib/task-spec/include/task-spec/variadic_tensor_ref.h similarity index 81% rename from lib/local-execution/include/local-execution/variadic_tensor_ref.h rename to lib/task-spec/include/task-spec/variadic_tensor_ref.h index 56da1bab64..e990fd5366 100644 --- a/lib/local-execution/include/local-execution/variadic_tensor_ref.h +++ b/lib/task-spec/include/task-spec/variadic_tensor_ref.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_VARIADIC_TENSOR_ARG_REF_H -#include "local-execution/arg_ref.h" -#include "local-execution/op_tensor_spec.h" +#include "task-spec/arg_ref.h" +#include "task-spec/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/concrete_arg.cc b/lib/task-spec/src/concrete_arg.cc similarity index 94% rename from lib/local-execution/src/concrete_arg.cc rename to lib/task-spec/src/concrete_arg.cc index 450d663e17..b67b74b19a 100644 --- a/lib/local-execution/src/concrete_arg.cc +++ b/lib/task-spec/src/concrete_arg.cc @@ -1,4 +1,4 @@ -#include "local-execution/concrete_arg.h" +#include "task-spec/concrete_arg.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_ref.cc b/lib/task-spec/src/op_arg_ref.cc similarity index 87% rename from lib/local-execution/src/op_arg_ref.cc rename to lib/task-spec/src/op_arg_ref.cc index b3d6e2f1a5..a427117982 100644 --- a/lib/local-execution/src/op_arg_ref.cc +++ b/lib/task-spec/src/op_arg_ref.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_arg_ref.h" +#include "task-spec/op_arg_ref.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_arg_spec.cc b/lib/task-spec/src/op_arg_spec.cc similarity index 83% rename from lib/local-execution/src/op_arg_spec.cc rename to lib/task-spec/src/op_arg_spec.cc index ddf50d9a4e..6e48a7c5f7 100644 --- a/lib/local-execution/src/op_arg_spec.cc +++ b/lib/task-spec/src/op_arg_spec.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_arg_spec.h" +#include "task-spec/op_arg_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_invocation.cc b/lib/task-spec/src/op_task_invocation.cc similarity index 97% rename from lib/local-execution/src/op_task_invocation.cc rename to lib/task-spec/src/op_task_invocation.cc index 19c8894b05..d495dd9f92 100644 --- a/lib/local-execution/src/op_task_invocation.cc +++ b/lib/task-spec/src/op_task_invocation.cc @@ -1,5 +1,5 @@ -#include "local-execution/op_task_invocation.h" -#include 
"local-execution/op_arg_spec.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/op_arg_spec.h" #include "utils/containers/contains_key.h" namespace FlexFlow { diff --git a/lib/local-execution/src/op_task_signature.cc b/lib/task-spec/src/op_task_signature.cc similarity index 99% rename from lib/local-execution/src/op_task_signature.cc rename to lib/task-spec/src/op_task_signature.cc index 932b330453..94ac16d092 100644 --- a/lib/local-execution/src/op_task_signature.cc +++ b/lib/task-spec/src/op_task_signature.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_task_signature.h" +#include "task-spec/op_task_signature.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" #include "utils/fmt/unordered_set.h" diff --git a/lib/local-execution/src/op_task_to_task_invocation.cc b/lib/task-spec/src/op_task_to_task_invocation.cc similarity index 98% rename from lib/local-execution/src/op_task_to_task_invocation.cc rename to lib/task-spec/src/op_task_to_task_invocation.cc index 0e04a2adec..f52800a8de 100644 --- a/lib/local-execution/src/op_task_to_task_invocation.cc +++ b/lib/task-spec/src/op_task_to_task_invocation.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_task_to_task_invocation.h" +#include "task-spec/op_task_to_task_invocation.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" diff --git a/lib/local-execution/src/op_tensor_spec.cc b/lib/task-spec/src/op_tensor_spec.cc similarity index 89% rename from lib/local-execution/src/op_tensor_spec.cc rename to lib/task-spec/src/op_tensor_spec.cc index a3d3e7ddac..1d97e6ae16 100644 --- a/lib/local-execution/src/op_tensor_spec.cc +++ b/lib/task-spec/src/op_tensor_spec.cc @@ -1,4 +1,4 @@ -#include "local-execution/op_tensor_spec.h" +#include "task-spec/op_tensor_spec.h" namespace FlexFlow { diff --git a/lib/local-execution/src/runtime_arg_ref.cc b/lib/task-spec/src/runtime_arg_ref.cc similarity index 89% rename from lib/local-execution/src/runtime_arg_ref.cc rename to lib/task-spec/src/runtime_arg_ref.cc index 1f591b4d82..bb4625c113 100644 --- a/lib/local-execution/src/runtime_arg_ref.cc +++ b/lib/task-spec/src/runtime_arg_ref.cc @@ -1,5 +1,5 @@ -#include "local-execution/runtime_arg_ref.h" -#include "local-execution/device_specific.h" +#include "task-spec/runtime_arg_ref.h" +#include "task-spec/device_specific.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_invocation.cc b/lib/task-spec/src/task_invocation.cc similarity index 77% rename from lib/local-execution/src/task_invocation.cc rename to lib/task-spec/src/task_invocation.cc index e08c1036da..4ba97f26de 100644 --- a/lib/local-execution/src/task_invocation.cc +++ b/lib/task-spec/src/task_invocation.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_invocation.h" +#include "task-spec/task_invocation.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature.cc b/lib/task-spec/src/task_signature.cc similarity index 93% rename from lib/local-execution/src/task_signature.cc rename to lib/task-spec/src/task_signature.cc index 1d57a1fc54..3ac038e8c5 100644 --- a/lib/local-execution/src/task_signature.cc +++ b/lib/task-spec/src/task_signature.cc @@ -1,4 +1,4 @@ -#include "local-execution/task_signature.h" +#include "task-spec/task_signature.h" namespace FlexFlow { diff --git a/lib/local-execution/src/variadic_tensor_ref.cc b/lib/task-spec/src/variadic_tensor_ref.cc similarity index 75% rename from lib/local-execution/src/variadic_tensor_ref.cc rename to lib/task-spec/src/variadic_tensor_ref.cc index 
efd43a6648..564e58ba95 100644 --- a/lib/local-execution/src/variadic_tensor_ref.cc +++ b/lib/task-spec/src/variadic_tensor_ref.cc @@ -1,4 +1,4 @@ -#include "local-execution/variadic_tensor_ref.h" +#include "task-spec/variadic_tensor_ref.h" namespace FlexFlow { From 639c2c1c6b2cd3efe76e2c62b6925e9bcc24b817 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 5 Feb 2025 18:10:59 -0800 Subject: [PATCH 39/91] Delete outdated sim environment file --- .../include/local-execution/sim_environment.h | 119 ------------------ 1 file changed, 119 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/sim_environment.h diff --git a/lib/local-execution/include/local-execution/sim_environment.h b/lib/local-execution/include/local-execution/sim_environment.h deleted file mode 100644 index 6c2f8d4ebb..0000000000 --- a/lib/local-execution/include/local-execution/sim_environment.h +++ /dev/null @@ -1,119 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H -#define _FLEXFLOW_LOCAL_EXECUTION_SIM_ENVIRONMENT_H - -#include "kernels/accessor.h" -#include "kernels/allocation.h" -#include "local-execution/cost_metrics.h" -#include "local-execution/task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" -#include "op-attrs/parallel_tensor_shape.dtg.h" -#include "pcg/machine_view.h" -#include "task-spec/op_task_invocation.h" -#include - -namespace FlexFlow { - -struct InputParallelTensorDesc { -public: - ParallelTensorShape shape; - IsTrainable trainable; -}; - -struct InputVariadicParallelTensorDesc { -public: - std::vector shapes; - IsTrainable trainable; -}; - -struct SimTaskBinding { - void bind(slot_id_t, ParallelTensorShape const &); - void bind_untrainable(slot_id_t, ParallelTensorShape const &); - void bind(slot_id_t, ParallelTensorShape const &, IsTrainable); - void bind(slot_id_t, InputParallelTensorDesc const &); - - void bind(slot_id_t, std::vector const &); - void bind_untrainable(slot_id_t, std::vector const &); - void bind(slot_id_t, std::vector const &, IsTrainable); - void bind(slot_id_t, InputVariadicParallelTensorDesc const &); - - template - void bind_arg(slot_id_t, T const &); -}; - -SimTaskBinding infer_bwd_binding(SimTaskBinding const &); - -struct SimEnvironment { - TaskArgumentAccessor get_init_accessor(task_id_t, SimTaskBinding const &); - TaskArgumentAccessor get_fwd_accessor(task_id_t, SimTaskBinding const &); - TaskArgumentAccessor get_bwd_accessor(task_id_t, SimTaskBinding const &); -}; - -struct SimEnvFactory { - SimEnvironment new_environment() const; -}; - -GenericTensorAccessorW allocate_input(SimEnvironment &sim, TensorShape const &); -GenericTensorAccessorW allocate_input(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_input(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_weight(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_weight(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_weight(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_output(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_output(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_output(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_input_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_input_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - 
allocate_input_grad(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_weight_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_weight_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_weight_grad(SimEnvironment &sim, - std::vector const &); - -GenericTensorAccessorW allocate_output_grad(SimEnvironment &sim, - TensorShape const &); -GenericTensorAccessorW allocate_output_grad(SimEnvironment &sim, - ParallelTensorShape const &); -std::vector - allocate_output_grad(SimEnvironment &sim, - std::vector const &); - -Allocator create_allocator(SimEnvironment &sim); -PerDeviceFFHandle get_ff_handle(SimEnvironment &sim); - -size_t get_input_memory_usage(SimEnvironment const &); -size_t get_output_memory_usage(SimEnvironment const &); -size_t get_weights_memory_usage(SimEnvironment const &); -size_t get_op_total_memory(SimEnvironment const &); - -CostMetrics make_metrics(float forward_time, - float backward_time, - float sync_time, - SimEnvironment const &); - -float default_estimate_sync_time(SimEnvironment const &); - -} // namespace FlexFlow - -#endif From a697044c4bbecae53357114d4d9c8cae12f46793 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 07:06:35 -0800 Subject: [PATCH 40/91] Finish API --- .../local-execution/allocated_tensors.h | 23 ++ .../allocated_tensors.struct.toml | 30 ++ .../local-execution/local_args_backing.h | 11 +- .../local-execution/local_tensor_backing.h | 67 ++-- .../local-execution/local_training_backing.h | 36 +- .../local-execution/model_training_instance.h | 15 +- .../include/local-execution/task_registry.h | 9 +- .../unallocated_tensors.struct.toml | 30 ++ lib/local-execution/src/allocated_tensors.cc | 128 +++++++ lib/local-execution/src/local_args_backing.cc | 17 +- .../src/local_tensor_backing.cc | 315 ++++++++---------- .../src/local_training_backing.cc | 173 ++++++---- .../src/model_training_instance.cc | 27 +- lib/local-execution/src/task_registry.cc | 81 +++-- lib/pcg/include/pcg/computation_graph.h | 4 + lib/pcg/src/pcg/computation_graph.cc | 7 + .../task-spec/op_task_to_task_invocation.h | 7 +- .../src/op_task_to_task_invocation.cc | 23 +- 18 files changed, 586 insertions(+), 417 deletions(-) create mode 100644 lib/local-execution/include/local-execution/allocated_tensors.h create mode 100644 lib/local-execution/include/local-execution/allocated_tensors.struct.toml create mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.struct.toml create mode 100644 lib/local-execution/src/allocated_tensors.cc diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h new file mode 100644 index 0000000000..60ee662ba8 --- /dev/null +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H + +#include "local-execution/allocated_tensors.dtg.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +bool are_allocated_forward_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); +bool are_allocated_gradient_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); +bool are_allocated_optimizer_tensors_valid(AllocatedTensors const &, + ComputationGraph const &); + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &, + std::unordered_map const &, + ArrayShape const &); + +} // 
namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml new file mode 100644 index 0000000000..e4be709709 --- /dev/null +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "AllocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h" +] + +[[fields]] +name = "tensor_type_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h index 4c9ede54fd..e9044dc6fa 100644 --- a/lib/local-execution/include/local-execution/local_args_backing.h +++ b/lib/local-execution/include/local-execution/local_args_backing.h @@ -12,18 +12,19 @@ namespace FlexFlow { struct LocalArgsBacking { - LocalArgsBacking(RuntimeArgConfig const &); + LocalArgsBacking( + RuntimeArgConfig const &, + std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> const &); public: // arguments + RuntimeArgConfig runtime_arg_config; std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> per_device_op_states; - RuntimeArgConfig runtime_arg_config; }; -void add_per_device_op_state(LocalArgsBacking &, - layer_guid_t const &, - DeviceSpecificDeviceStates const &); +LocalArgsBacking + make_args_backing_with_empty_device_states(RuntimeArgConfig const &); std::optional<DeviceSpecificDeviceStates> get_per_device_op_state_if_exists(LocalArgsBacking const &,
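For orientation, a minimal sketch (not part of the patch) of what a value of the AllocatedTensors struct introduced above holds. All names here are hypothetical: `input_guid` stands for a tensor_guid_t from the computation graph, `input_grad` for a gradient_tensor_t, and `input_backing` / `grad_backing` for accessors obtained from a real Allocator.

// Hedged sketch: the three fields mirror allocated_tensors.struct.toml, i.e.
// raw backings keyed by TensorTypeVariant, plus the gradient and optimizer
// mappings that tie those backings back to computation-graph tensors.
AllocatedTensors allocated = AllocatedTensors{
    /*tensor_type_backings=*/{{TensorTypeVariant{input_guid}, input_backing},
                              {TensorTypeVariant{input_grad}, grad_backing}},
    /*gradient_mapping=*/{{input_guid, input_grad}},
    /*optimizer_mapping=*/{}};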
diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 9d35373784..86244eab13 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -3,22 +3,19 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" -#include "op-attrs/tensor_shape.dtg.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/optimizer_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" -#include "task-spec/tensor_type_t.dtg.h" namespace FlexFlow { using TensorBackingMap = std::unordered_map<lowered_tensor_t, GenericTensorAccessorW>; struct LocalTensorBacking { - LocalTensorBacking() = default; - LocalTensorBacking( - std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const - &allocated_tensor_backings, - std::unordered_set<tensor_guid_t> const &allocated_tensor_guids, - std::unordered_map<tensor_guid_t, gradient_tensor_t> const - &allocated_gradient_mapping, - std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>> const - &allocated_optimizer_mapping, - std::unordered_set<loss_tensor_t> const &allocated_loss_tensors); - - lowered_tensor_t allocate_tensor(TensorShape const &, Allocator &); + LocalTensorBacking(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator const &); + +public: + GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; public: // tensors @@ -55,39 +46,23 @@ struct LocalTensorBacking { std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>> tensor_optimizer_mapping; + Allocator allocator; private: - lowered_tensor_t insert_tensor(GenericTensorAccessorW const &); + lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; -void allocate_tensor_guid(LocalTensorBacking &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); -void allocate_gradient_tensor(LocalTensorBacking &, - gradient_tensor_t const &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); -void allocate_optimizer_tensors(LocalTensorBacking &, - std::vector<optimizer_tensor_t> const &, - tensor_guid_t const &, - TensorShape const &, - Allocator &); - -void allocate_all_computation_graph_tensors(LocalTensorBacking &, - GradientTensorSource &, - ComputationGraph const &, - Allocator &); -void allocate_all_optimizer_tensors(LocalTensorBacking &, - OptimizerTensorSource &, - ComputationGraph const &, - Allocator &, - OptimizerAttrs const &); -loss_tensor_t allocate_loss_tensor(LocalTensorBacking &, - LossTensorSource const &, - TensorShape const &, - Allocator &); +UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, + ComputationGraph const &, + GradientTensorSource &); + +UnallocatedTensors + generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, + ComputationGraph const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index ef5e7ec41e..b61d20c232
LocalTensorBacking const &); std::optional call_task_impl(TaskRegistry const &, - task_id_t task_id, - TaskArgumentAccessor acc); + task_id_t const &task_id, + TaskArgumentAccessor const &acc); -void execute_init(LocalTrainingBacking &, layer_guid_t const &); -std::optional execute_forward(LocalTrainingBacking &, +std::optional execute_forward(LocalTrainingBacking const &, layer_guid_t const &); -std::optional execute_backward(LocalTrainingBacking &, +std::optional execute_backward(LocalTrainingBacking const &, layer_guid_t const &); -void compute_loss(LocalTrainingBacking &, +void compute_loss(LocalTrainingBacking const &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); -void execute_update(LocalTrainingBacking &, +void execute_update(LocalTrainingBacking const &, layer_guid_t const &, OptimizerAttrs const &); TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, LocalArgsBacking const &, - TaskInvocation const &, - Allocator &); + TaskInvocation const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index c264418abc..99a1bd5a9a 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -14,24 +14,19 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(LocalTrainingBacking const &, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); LocalTrainingBacking training_backing; - LossAttrs loss_attrs; - OptimizerAttrs optimizer_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; - -private: - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; + LossAttrs loss_attrs; + OptimizerAttrs optimizer_attrs; }; -void init(ModelTrainingInstance &); -PerLayerElapsedTime forward(ModelTrainingInstance &); -PerLayerElapsedTime backward(ModelTrainingInstance &); +PerLayerElapsedTime forward(ModelTrainingInstance const &); +PerLayerElapsedTime backward(ModelTrainingInstance const &); void update(ModelTrainingInstance &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h index 22cc344b3d..56e98ba8da 100644 --- a/lib/local-execution/include/local-execution/task_registry.h +++ b/lib/local-execution/include/local-execution/task_registry.h @@ -9,19 +9,12 @@ namespace FlexFlow { -TaskRegistry empty_task_registry(); - -void register_tasks_for_layer(TaskRegistry &, - layer_guid_t const &, - ComputationGraphOpAttrs const &attrs); +TaskRegistry construct_task_registry(ComputationGraph const &); bool registry_contains_task_for_layer(TaskRegistry const &, layer_guid_t const &, OpTaskType const &); -void register_all_computation_graph_tasks(TaskRegistry &, - ComputationGraph const &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..87abf83d13 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + 
diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..87abf83d13 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", + "ord" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "op-attrs/tensor_shape.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h" +] + +[[fields]] +name = "tensor_type_shapes" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc new file mode 100644 index 0000000000..e64db0cfff --- /dev/null +++ b/lib/local-execution/src/allocated_tensors.cc @@ -0,0 +1,128 @@ +#include "local-execution/allocated_tensors.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_union.h" + +namespace FlexFlow { + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &tensor_type, + std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const + &allocated_tensor_backings, + ArrayShape const &expected_shape) { + if (allocated_tensor_backings.count(tensor_type)) { + GenericTensorAccessorW tensor_backing = + allocated_tensor_backings.at(tensor_type); + if (expected_shape == tensor_backing.shape) { + return true; + } + } + return false; +}; + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<tensor_guid_t> all_tensor_guids = + set_union(keys(allocated_tensors.gradient_mapping), + keys(allocated_tensors.optimizer_mapping)); + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_guid); + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{expected_tensor_attrs.shape})) { + return false; + } + } + return true; +} + +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<TensorTypeVariant> + tensors_in_mappings; // will check whether any dangling gradient tensors + // were allocated + + for (std::pair<tensor_guid_t, gradient_tensor_t> const &tensor_to_grad : + allocated_tensors.gradient_mapping) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_to_grad.first); + if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + allocated_tensors.tensor_type_backings + .at(TensorTypeVariant{tensor_to_grad.first}) + .shape; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has<gradient_tensor_t>()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + return true; +} + +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph) { + std::unordered_set<TensorTypeVariant> + tensors_in_mappings; // will check whether any dangling optimizer tensors + // were allocated + + for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const + &tensor_to_optimizers :
allocated_tensors.optimizer_mapping) { + TensorAttrs expected_tensor_attrs = + get_tensor_attrs(computation_graph, tensor_to_optimizers.first); + if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + allocated_tensors.tensor_type_backings + .at(TensorTypeVariant{tensor_to_optimizers.first}) + .shape; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + + return true; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc index 715a96efa6..4a342767b2 100644 --- a/lib/local-execution/src/local_args_backing.cc +++ b/lib/local-execution/src/local_args_backing.cc @@ -7,15 +7,18 @@ namespace FlexFlow { -LocalArgsBacking::LocalArgsBacking(RuntimeArgConfig const &runtime_arg_config) - : runtime_arg_config(runtime_arg_config){}; - -void add_per_device_op_state(LocalArgsBacking &local_args_backing, - layer_guid_t const &op_guid, - DeviceSpecificDeviceStates const &device_state) { - local_args_backing.per_device_op_states.insert({op_guid, device_state}); +LocalArgsBacking make_args_backing_with_empty_device_states( + RuntimeArgConfig const &runtime_arg_config) { + return LocalArgsBacking{runtime_arg_config, {}}; } +LocalArgsBacking::LocalArgsBacking( + RuntimeArgConfig const &runtime_arg_config, + std::unordered_map const + &device_states) + : runtime_arg_config(runtime_arg_config), + per_device_op_states(device_states){}; + std::optional get_per_device_op_state_if_exists( LocalArgsBacking const &local_args_backing, layer_guid_t const &layer_guid) { diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 00c170d501..67bbd59c3b 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,6 +1,7 @@ #include "local-execution/local_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" +#include "local-execution/allocated_tensors.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" @@ -11,195 +12,177 @@ namespace FlexFlow { LocalTensorBacking::LocalTensorBacking( - std::unordered_map const - &allocated_tensor_backings, - std::unordered_set const &allocated_tensor_guids, - std::unordered_map const - &allocated_gradient_mapping, - std::unordered_map> const - &allocated_optimizer_mapping, - std::unordered_set const &allocated_loss_tensors) - : tensor_gradient_mapping(allocated_gradient_mapping), - tensor_optimizer_mapping(allocated_optimizer_mapping) { - - // computation graph tensors - for (tensor_guid_t const &allocated_tensor_guid : allocated_tensor_guids) { - lowered_tensor_t lowered_tensor = this->insert_tensor( - allocated_tensor_backings.at(TensorTypeVariant{allocated_tensor_guid})); - this->tensor_lowering_mapping.insert( - {allocated_tensor_guid, lowered_tensor}); - } - - // gradient tensors - for (std::pair const - &tensor_guid_gradient_pair : allocated_gradient_mapping) { - 
gradient_tensor_t allocated_gradient_tensor = - tensor_guid_gradient_pair.second; + AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + Allocator const &allocator) + : tensor_gradient_mapping(allocated_tensors.gradient_mapping), + tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), + allocator(allocator) { + + // handle already-allocated tensors + for (std::pair const + &tensor_type_backing : allocated_tensors.tensor_type_backings) { lowered_tensor_t lowered_tensor = - this->insert_tensor(allocated_tensor_backings.at( - TensorTypeVariant{allocated_gradient_tensor})); - this->gradient_tensor_lowering_mapping.insert( - {allocated_gradient_tensor, lowered_tensor}); + this->insert_tensor(tensor_type_backing.first); + this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); } - // optimizer tensors + // allocate new tensors + this->tensor_gradient_mapping.insert( + unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); + for (std::pair> const - &tensor_guid_optimizers_pair : allocated_optimizer_mapping) { - for (optimizer_tensor_t const &allocated_optimizer_tensor : - tensor_guid_optimizers_pair.second) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(allocated_tensor_backings.at( - TensorTypeVariant{allocated_optimizer_tensor})); - this->optimizer_tensor_lowering_mapping.insert( - {allocated_optimizer_tensor, lowered_tensor}); + &unallocated_optimizer_tensors : + unallocated_tensors.optimizer_mapping) { + if (this->tensor_optimizer_mapping.count( + unallocated_optimizer_tensors.first)) { + for (optimizer_tensor_t const &optimizer_tensor : + unallocated_optimizer_tensors.second) { + this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] + .push_back(optimizer_tensor); + } + } else { + this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); } } - // loss tensors - for (loss_tensor_t const &allocated_loss_tensor : allocated_loss_tensors) { - lowered_tensor_t lowered_tensor = this->insert_tensor( - allocated_tensor_backings.at(TensorTypeVariant{allocated_loss_tensor})); - this->loss_tensor_lowering_mapping.insert( - {allocated_loss_tensor, lowered_tensor}); + for (std::pair const &tensor_type_shape : + unallocated_tensors.tensor_type_shapes) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_shape.first); + GenericTensorAccessorW tensor_backing = + this->allocator.allocate_tensor(tensor_type_shape.second); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); } - - // sanity check that backings match up with the mappings - assert(this->tensor_backings.size() == allocated_tensor_backings.size()); }; -lowered_tensor_t LocalTensorBacking::insert_tensor( - GenericTensorAccessorW const &tensor_backing) { +lowered_tensor_t + LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { lowered_tensor_t lowered_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_backings.insert({lowered_tensor, tensor_backing}); + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, lowered_tensor}); + }, + [&](loss_tensor_t const &loss_tensor) { + 
this->loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); return lowered_tensor; } -lowered_tensor_t - LocalTensorBacking::allocate_tensor(TensorShape const &tensor_shape, - Allocator &allocator) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_shape); - return this->insert_tensor(tensor_backing); +GenericTensorAccessorW + LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { + lowered_tensor_t lowered_tensor = tensor_type.visit( + overload{[&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return this->tensor_backings.at(lowered_tensor); } -void allocate_tensor_guid(LocalTensorBacking &local_tensor_backing, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_lowering_mapping, - tensor_guid)) { - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.tensor_lowering_mapping.insert( - {tensor_guid, lowered_tensor}); - } -} - -void allocate_gradient_tensor(LocalTensorBacking &local_tensor_backing, - gradient_tensor_t const &gradient_tensor, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_gradient_mapping, - tensor_guid)) { - local_tensor_backing.tensor_gradient_mapping.insert( - {tensor_guid, gradient_tensor}); - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.gradient_tensor_lowering_mapping.insert( - {gradient_tensor, lowered_tensor}); - } -} - -void allocate_optimizer_tensors( - LocalTensorBacking &local_tensor_backing, - std::vector const &optimizer_tensors, - tensor_guid_t const &tensor_guid, - TensorShape const &tensor_shape, - Allocator &allocator) { - if (!contains_key(local_tensor_backing.tensor_optimizer_mapping, - tensor_guid)) { - // insert new optimizer tensors into mappings - std::vector optimizer_tensors; - for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { - // allocate lowered tensor - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, lowered_tensor}); - } - local_tensor_backing.tensor_optimizer_mapping.insert( - {tensor_guid, optimizer_tensors}); - } -} +UnallocatedTensors + generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + GradientTensorSource &gradient_tensor_source) { + assert(are_allocated_forward_tensors_valid(allocated_tensors, + computation_graph)); + assert(are_allocated_gradient_tensors_valid(allocated_tensors, + computation_graph)); -void allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, - loss_tensor_t const 
&loss_tensor,
-                          TensorShape const &tensor_shape,
-                          Allocator &allocator) {
-  lowered_tensor_t lowered_tensor =
-      local_tensor_backing.allocate_tensor(tensor_shape, allocator);
-  local_tensor_backing.loss_tensor_lowering_mapping.insert(
-      {loss_tensor, lowered_tensor});
-}
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
 
-void allocate_all_computation_graph_tensors(
-    LocalTensorBacking &local_tensor_backing,
-    GradientTensorSource &gradient_tensor_source,
-    ComputationGraph const &computation_graph,
-    Allocator &allocator) {
-  // allocate each layer's tensors and gradient tensors
   for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
     TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
-    allocate_tensor_guid(
-        local_tensor_backing, tensor_guid, tensor_attrs.shape, allocator);
+    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
+    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
+      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
+    }
 
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
       gradient_tensor_t gradient_tensor =
           gradient_tensor_source.new_gradient_tensor();
-      allocate_gradient_tensor(local_tensor_backing,
-                               gradient_tensor,
-                               tensor_guid,
-                               tensor_attrs.shape,
-                               allocator);
+      tensor_type_shapes.insert(
+          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
+      gradient_mapping.insert({tensor_guid, gradient_tensor});
     }
   }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
 }
 
-void allocate_all_optimizer_tensors(
-    LocalTensorBacking &local_tensor_backing,
-    OptimizerTensorSource &optimizer_tensor_source,
+UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+    AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
-    Allocator &allocator,
+    GradientTensorSource &gradient_tensor_source,
+    OptimizerTensorSource &optimizer_tensor_source,
     OptimizerAttrs const &optimizer_attrs) {
+
+  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
+      allocated_tensors, computation_graph, gradient_tensor_source);
+  assert(are_allocated_optimizer_tensors_valid(allocated_tensors,
+                                               computation_graph));
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
+      unallocated_tensors.tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
+      unallocated_tensors.gradient_mapping;
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      optimizer_mapping;
+
   for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) {
     TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid);
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.optimizer_mapping.count(tensor_guid)) {
       std::vector<optimizer_tensor_t> optimizer_tensors;
-      for (int i = 0; i < get_num_optimizer_tensors(optimizer_attrs); ++i) {
-        optimizer_tensors.push_back(
-            optimizer_tensor_source.new_optimizer_tensor());
+
+      int num_optimizer_tensors_to_allocate =
+          get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
+
+      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
+        optimizer_tensor_t optimizer_tensor =
+            optimizer_tensor_source.new_optimizer_tensor();
+        optimizer_tensors.push_back(optimizer_tensor);
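+        // Each freshly created optimizer tensor is recorded with the
+        // forward tensor's shape, so the backing constructor can allocate
+        // it directly. E.g., an optimizer reporting two state tensors per
+        // weight, with one already allocated by the caller, creates and
+        // records exactly one new optimizer_tensor_t here.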
+ tensor_type_shapes.insert( + {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); } - allocate_optimizer_tensors(local_tensor_backing, - optimizer_tensors, - tensor_guid, - tensor_attrs.shape, - allocator); + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); } } -} -loss_tensor_t allocate_loss_tensor(LocalTensorBacking &local_tensor_backing, - LossTensorSource &loss_tensor_source, - TensorShape const &tensor_shape, - Allocator &allocator) { - loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - lowered_tensor_t lowered_tensor = - local_tensor_backing.allocate_tensor(tensor_shape, allocator); - local_tensor_backing.loss_tensor_lowering_mapping.insert( - {loss_tensor, lowered_tensor}); - return loss_tensor; + return UnallocatedTensors{ + tensor_type_shapes, gradient_mapping, optimizer_mapping}; } TensorSlotsBacking construct_tensor_slots_backing( @@ -207,30 +190,10 @@ TensorSlotsBacking construct_tensor_slots_backing( TaskBinding const &binding) { TensorSlotsBacking mapping; - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - - lowered_tensor_t lowered_tensor = - tensor_binding.second.visit(overload{ - [&](tensor_guid_t const &t) { - return local_tensor_backing.tensor_lowering_mapping.at(t); - }, - [&](gradient_tensor_t const &t) { - return local_tensor_backing.gradient_tensor_lowering_mapping.at( - t); - }, - [&](optimizer_tensor_t const &t) { - return local_tensor_backing.optimizer_tensor_lowering_mapping.at( - t); - }, - [&](loss_tensor_t const &t) { - return local_tensor_backing.loss_tensor_lowering_mapping.at(t); - }, - }); - - GenericTensorAccessorW accessor = - local_tensor_backing.tensor_backings.at(lowered_tensor); - mapping.insert({slot_tensor_type_id, accessor}); + for (std::pair const &tensor_binding : + binding.get_tensor_bindings()) { + mapping.insert({tensor_binding.first, + local_tensor_backing.get_tensor(tensor_binding.second)}); } return mapping; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 2679a502e3..8a0dc825eb 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -2,11 +2,10 @@ #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "task-spec/task_invocation.h" - #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/task_invocation.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" @@ -17,70 +16,93 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator const &allocator, + AllocatedTensors const &allocated_tensors, ComputationGraph const &computation_graph, - LocalTensorBacking const &local_tensor_backing, - LocalArgsBacking const &local_args_backing) - : allocator(allocator), computation_graph(computation_graph), - task_registry(empty_task_registry()), - local_tensor_backing(local_tensor_backing), - local_args_backing(local_args_backing) { - allocate_all_computation_graph_tensors(this->local_tensor_backing, - this->gradient_tensor_source, - this->computation_graph, - this->allocator); - register_all_computation_graph_tasks(this->task_registry, - this->computation_graph); -} + RuntimeArgConfig const 
&runtime_arg_config) + : computation_graph(computation_graph), + task_registry(construct_task_registry(computation_graph)), + local_tensor_backing( + allocated_tensors, + generate_unallocated_tensors(allocated_tensors, + computation_graph, + this->gradient_tensor_source), + allocator), + local_args_backing(initialize_args_backing(this->task_registry, + this->computation_graph, + runtime_arg_config, + this->local_tensor_backing)){}; -DeviceSpecificDeviceStates - call_init_task_impl(TaskRegistry const &task_registry, - task_id_t task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); +LocalTrainingBacking::LocalTrainingBacking( + Allocator const &allocator, + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) + : computation_graph(computation_graph), + task_registry(construct_task_registry(computation_graph)), + local_tensor_backing(allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, + computation_graph, + this->gradient_tensor_source, + this->optimizer_tensor_source, + optimizer_attrs), + allocator), + local_args_backing(initialize_args_backing(this->task_registry, + this->computation_graph, + runtime_arg_config, + this->local_tensor_backing)){}; + +LocalArgsBacking + initialize_args_backing(TaskRegistry const &task_registry, + ComputationGraph const &cg, + RuntimeArgConfig const &runtime_arg_config, + LocalTensorBacking const &local_tensor_backing) { + std::unordered_map + per_device_op_states; + for (layer_guid_t const &node : topological_ordering(cg)) { + if (registry_contains_task_for_layer( + task_registry, node, OpTaskType::INIT)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + + TaskInvocation invocation = + lower_to_task_invocation(init(attrs), + node, + get_incoming_inputs(cg, node), + get_incoming_input_shapes(cg, node), + get_outgoing_tensors(cg, node), + get_incoming_weights(cg, node), + local_tensor_backing.tensor_gradient_mapping, + std::nullopt); + TaskArgumentAccessor accessor = get_task_arg_accessor( + local_tensor_backing, + make_args_backing_with_empty_device_states(runtime_arg_config), + invocation); + TaskSignatureAndImpl task_sig_impl = + task_registry.task_mapping.at(invocation.task_id); + auto fn = task_sig_impl.impl_function.get() + .function_ptr; + DeviceSpecificDeviceStates device_state = fn(accessor); + per_device_op_states.insert({node, device_state}); + } + } + + return LocalArgsBacking{runtime_arg_config, per_device_op_states}; } std::optional call_task_impl(TaskRegistry const &task_registry, - task_id_t task_id, - TaskArgumentAccessor acc) { + task_id_t const &task_id, + TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } -void execute_init(LocalTrainingBacking &local_training_backing, - layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .attrs; - - TaskInvocation invocation = lower_to_task_invocation( - init(attrs), - operator_node, - local_training_backing.computation_graph, - 
local_training_backing.local_tensor_backing.tensor_gradient_mapping,
-        std::nullopt);
-    TaskArgumentAccessor accessor =
-        get_task_arg_accessor(local_training_backing.local_tensor_backing,
-                              local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
-    DeviceSpecificDeviceStates device_state = call_init_task_impl(
-        local_training_backing.task_registry, invocation.task_id, accessor);
-    add_per_device_op_state(
-        local_training_backing.local_args_backing, operator_node, device_state);
-  }
-}
-
 std::optional<float>
-    execute_forward(LocalTrainingBacking &local_training_backing,
-                    layer_guid_t const &operator_node) {
+    execute_forward(LocalTrainingBacking const &local_training_backing,
+                    layer_guid_t const &operator_node) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::FWD)) {
@@ -94,14 +116,20 @@ std::optional<float>
     TaskInvocation invocation = lower_to_task_invocation(
         forward(attrs),
         operator_node,
-        local_training_backing.computation_graph,
+        get_incoming_inputs(local_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(local_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(local_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(local_training_backing.computation_graph,
+                             operator_node),
         local_training_backing.local_tensor_backing.tensor_gradient_mapping,
         device_state);
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
+                              invocation);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -109,7 +137,7 @@
   }
 }
 
-void compute_loss(LocalTrainingBacking &local_training_backing,
+void compute_loss(LocalTrainingBacking const &local_training_backing,
                   LossAttrs const &loss_attrs,
                   tensor_guid_t const &logit_tensor,
                   loss_tensor_t const &label_tensor) {
@@ -124,14 +152,13 @@ void compute_loss(LocalTrainingBacking &local_training_backing,
   TaskArgumentAccessor loss_accessor =
       get_task_arg_accessor(local_training_backing.local_tensor_backing,
                             local_training_backing.local_args_backing,
-                            loss_invocation,
-                            local_training_backing.allocator);
+                            loss_invocation);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
   loss_impl_fn.get().function_ptr(loss_accessor);
 }
 
 std::optional<float>
-    execute_backward(LocalTrainingBacking &local_training_backing,
+    execute_backward(LocalTrainingBacking const &local_training_backing,
                      layer_guid_t const &operator_node) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
@@ -146,14 +173,20 @@ std::optional<float>
     TaskInvocation invocation = lower_to_task_invocation(
         backward(attrs),
         operator_node,
-        local_training_backing.computation_graph,
+        get_incoming_inputs(local_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(local_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(local_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(local_training_backing.computation_graph,
+                             operator_node),
        local_training_backing.local_tensor_backing.tensor_gradient_mapping,
         device_state);
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation,
-                              local_training_backing.allocator);
+                              invocation);
    return 
call_task_impl( local_training_backing.task_registry, invocation.task_id, accessor); } else { @@ -161,7 +194,7 @@ std::optional } } -void execute_update(LocalTrainingBacking &local_training_backing, +void execute_update(LocalTrainingBacking const &local_training_backing, layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { LayerAttrs layer_attrs = @@ -191,8 +224,7 @@ void execute_update(LocalTrainingBacking &local_training_backing, TaskArgumentAccessor accessor = get_task_arg_accessor(local_training_backing.local_tensor_backing, local_training_backing.local_args_backing, - invocation, - local_training_backing.allocator); + invocation); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); update_impl_fn.get().function_ptr(accessor); } @@ -201,14 +233,13 @@ void execute_update(LocalTrainingBacking &local_training_backing, TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation, - Allocator &allocator) { + TaskInvocation const &invocation) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(local_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing); + local_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 98b8851212..4a22937174 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -8,31 +8,15 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( LocalTrainingBacking const &local_training_backing, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) : training_backing(local_training_backing), loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor( - allocate_loss_tensor(this->training_backing.local_tensor_backing, - this->loss_tensor_source, - label_tensor_shape, - this->training_backing.allocator)) { - allocate_all_optimizer_tensors(this->training_backing.local_tensor_backing, - this->optimizer_tensor_source, - this->training_backing.computation_graph, - this->training_backing.allocator, - this->optimizer_attrs); -} - -void init(ModelTrainingInstance &model_training_instance) { - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { - execute_init(model_training_instance.training_backing, node); - } -} + label_tensor(label_tensor){}; -PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime + forward(ModelTrainingInstance const &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { @@ -43,7 +27,8 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime + backward(ModelTrainingInstance const &model_training_instance) { 
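   // compute_loss seeds the gradient of the logit tensor against the label
   // tensor; execute_backward then runs each layer's backward task and
   // records its elapsed time, mirroring forward() above.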
compute_loss(model_training_instance.training_backing,
               model_training_instance.loss_attrs,
               model_training_instance.logit_tensor,
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index 7b0c80a9bc..f33aef8460 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -4,44 +4,49 @@
 
 namespace FlexFlow {
 
-TaskRegistry empty_task_registry() {
-  return TaskRegistry{{}, {}, {}, {}};
-}
+TaskRegistry construct_task_registry(ComputationGraph const &cg) {
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> init_task_ids;
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> fwd_task_ids;
+  std::unordered_map<layer_guid_t, std::optional<task_id_t>> bwd_task_ids;
+
+  std::unordered_map<task_id_t, TaskSignatureAndImpl> task_mapping;
+
+  for (layer_guid_t const &node : topological_ordering(cg)) {
+    init_task_ids.insert({node, std::nullopt});
+    fwd_task_ids.insert({node, std::nullopt});
+    bwd_task_ids.insert({node, std::nullopt});
 
-void register_tasks_for_layer(TaskRegistry &task_registry,
-                              layer_guid_t const &op_id,
-                              ComputationGraphOpAttrs const &attrs) {
-  task_registry.init_task_ids.insert({op_id, std::nullopt});
-  task_registry.forward_task_ids.insert({op_id, std::nullopt});
-  task_registry.backward_task_ids.insert({op_id, std::nullopt});
+    ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs;
+    std::vector<task_id_t> task_ids = get_task_ids(attrs);
 
-  // register tasks
-  std::vector<task_id_t> task_ids = get_task_ids(attrs);
-  for (task_id_t task_id : task_ids) {
-    TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-    switch (task_signature_impl.task_signature.type) {
-      case OpTaskType::INIT:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
-        task_registry.init_task_ids[op_id] = task_id;
-        break;
-      case OpTaskType::FWD:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   forward(attrs)));
-        task_registry.forward_task_ids[op_id] = task_id;
-        break;
-      case OpTaskType::BWD:
-        assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   backward(attrs)));
-        task_registry.backward_task_ids[op_id] = task_id;
-        break;
-      default:
-        throw mk_runtime_error(
-            fmt::format("Invalid OpTaskType, got {}",
-                        task_signature_impl.task_signature.type));
+    for (task_id_t const &task_id : task_ids) {
+      TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
+      switch (task_signature_impl.task_signature.type) {
+        case OpTaskType::INIT:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     init(attrs)));
+          init_task_ids[node] = task_id;
+          break;
+        case OpTaskType::FWD:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     forward(attrs)));
+          fwd_task_ids[node] = task_id;
+          break;
+        case OpTaskType::BWD:
+          assert(is_invocation_valid(task_signature_impl.task_signature,
+                                     backward(attrs)));
+          bwd_task_ids[node] = task_id;
+          break;
+        default:
+          throw mk_runtime_error(
+              fmt::format("Invalid OpTaskType, got {}",
+                          task_signature_impl.task_signature.type));
+      }
+      task_mapping.insert({task_id, task_signature_impl});
    }
-    task_registry.task_mapping.insert({task_id, task_signature_impl});
  }
+
+  return TaskRegistry{init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping};
 }
 
 bool registry_contains_task_for_layer(TaskRegistry const &task_registry,
@@ -66,12 +71,4 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry,
   return task_ids.at(op).has_value();
 }
 
-void register_all_computation_graph_tasks(TaskRegistry &registry,
-                                          ComputationGraph const &cg) {
-  for (layer_guid_t const &node : topological_ordering(cg)) {
-    ComputationGraphOpAttrs attrs = 
get_layer_attrs(cg, node).attrs; - register_tasks_for_layer(registry, node, attrs); - } -} - } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index 9d4d8c85c1..c473ae1f40 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -34,6 +34,10 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, std::vector get_incoming_inputs(ComputationGraph const &, layer_guid_t const &); + +std::vector get_incoming_input_shapes(ComputationGraph const &, + layer_guid_t const &); + std::vector get_incoming_weights(ComputationGraph const &, layer_guid_t const &); diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 312488bdf5..74448f18bc 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -84,6 +84,13 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, [](DataflowOutput const &o) { return tensor_guid_t{o}; }); } +std::vector get_incoming_input_shapes(ComputationGraph const &cg, + layer_guid_t n) { + return transform(get_incoming_inputs(cg, n), [&](tensor_guid_t const &t) { + return get_tensor_attrs(cg, t).shape; + }); +} + static std::vector get_incoming_tensors_with_role(ComputationGraph const &cg, layer_guid_t const &l, diff --git a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 0c5fdb39a4..68c7f05d77 100644 --- a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -13,7 +13,10 @@ namespace FlexFlow { TaskInvocation lower_to_task_invocation( OpTaskInvocation const &, layer_guid_t const &, - ComputationGraph const &, + std::vector const &input_tensors, + std::vector const &input_tensor_shapes, + std::vector const &output_tensors, + std::vector const &weight_tensors, std::unordered_map const &, std::optional const &); @@ -22,7 +25,7 @@ ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &, - ComputationGraph const &, + std::vector const &, layer_guid_t const &, std::optional const &); diff --git a/lib/task-spec/src/op_task_to_task_invocation.cc b/lib/task-spec/src/op_task_to_task_invocation.cc index f52800a8de..515d1dc1dc 100644 --- a/lib/task-spec/src/op_task_to_task_invocation.cc +++ b/lib/task-spec/src/op_task_to_task_invocation.cc @@ -7,18 +7,14 @@ namespace FlexFlow { TaskInvocation lower_to_task_invocation( OpTaskInvocation const &op_task_invocation, layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, + std::vector const &input_tensors, + std::vector const &input_tensor_shapes, + std::vector const &output_tensors, + std::vector const &weight_tensors, std::unordered_map const &tensor_gradient_mapping, std::optional const &device_states) { TaskBinding binding; - // tensors - std::vector input_tensors = - get_incoming_inputs(computation_graph, layer_guid); - std::vector output_tensors = - get_outgoing_tensors(computation_graph, layer_guid); - std::vector weight_tensors = - get_incoming_weights(computation_graph, layer_guid); for (auto const &tensor_binding : op_task_invocation.binding.get_tensor_bindings()) { @@ -56,7 +52,7 @@ TaskInvocation lower_to_task_invocation( if (arg_binding.second.has()) { ConcreteArgSpec concrete_arg = lower_to_concrete_arg_spec(arg_binding.second.get(), - computation_graph, + 
input_tensor_shapes, layer_guid, device_states); binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); @@ -76,7 +72,7 @@ TaskInvocation lower_to_task_invocation( ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &op_arg_ref_spec, - ComputationGraph const &cg, + std::vector const &input_tensor_shapes, layer_guid_t const &op_guid, std::optional const &device_states) { if (op_arg_ref_spec.holds()) { @@ -86,10 +82,9 @@ ConcreteArgSpec lower_to_concrete_arg_spec( } else if (op_arg_ref_spec.holds()) { ParallelTensorShapeRefType index_op_arg_ref = op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = - get_incoming_inputs(cg, op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); + TensorShape input_tensor_shape = + input_tensor_shapes.at(index_op_arg_ref.idx); + ParallelTensorShape shape = lift_to_parallel(input_tensor_shape); return ConcreteArgSpec::create(shape); } else { throw mk_runtime_error("Unhandled op arg ref type"); From 187a8d53a1bb5d62ba2f5039ff58a32c0a5a2187 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:21:07 -0800 Subject: [PATCH 41/91] Add tests for allocated and unallocated --- lib/kernels/include/kernels/array_shape.h | 7 + lib/kernels/src/array_shape.cc | 11 + .../local-execution/allocated_tensors.h | 17 +- .../allocated_tensors.struct.toml | 5 +- .../local-execution/local_tensor_backing.h | 4 +- .../unallocated_tensors.struct.toml | 5 +- lib/local-execution/src/allocated_tensors.cc | 109 ++--- .../src/local_cost_estimator.cc | 10 +- .../src/local_tensor_backing.cc | 43 +- .../src/local_training_backing.cc | 4 +- lib/local-execution/test/CMakeLists.txt | 4 +- .../test/src/test_allocated_tensors.cc | 221 ++++++++++ .../test/src/test_unallocated_tensors.cc | 383 ++++++++++++++++++ lib/local-execution/test/src/test_utils.cc | 10 + lib/local-execution/test/src/test_utils.h | 10 + lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 10 + lib/utils/include/utils/required_core.h | 2 +- 18 files changed, 766 insertions(+), 91 deletions(-) create mode 100644 lib/local-execution/test/src/test_allocated_tensors.cc create mode 100644 lib/local-execution/test/src/test_unallocated_tensors.cc diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 95d20ceca3..7e14bf41ad 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -69,4 +69,11 @@ std::ostream &operator<<(std::ostream &, ArrayShape const &); } // namespace FlexFlow +namespace std { +template <> +struct hash<::FlexFlow::ArrayShape> { + size_t operator()(::FlexFlow::ArrayShape const &) const; +}; +} // namespace std + #endif diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 220f8ebeea..521b15e435 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -128,3 +128,14 @@ std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { } } // namespace FlexFlow + +namespace std { +size_t hash::operator()( + ::FlexFlow::ArrayShape const &x) const { + size_t result = 0; + result ^= std::hash<::FlexFlow::LegionOrdered<::FlexFlow::nonnegative_int>>{}( + x.dims) + + 0x9e3779b9 + (result << 6) + (result >> 2); + return result; +} +} // namespace std diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h 
b/lib/local-execution/include/local-execution/allocated_tensors.h index 60ee662ba8..0d01350d9f 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -6,12 +6,17 @@ namespace FlexFlow { -bool are_allocated_forward_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); -bool are_allocated_gradient_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); -bool are_allocated_optimizer_tensors_valid(AllocatedTensors const &, - ComputationGraph const &); +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool are_allocated_tensors_valid(AllocatedTensors const &, std::unordered_map const &); bool is_allocated_tensor_backing_valid( TensorTypeVariant const &, diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index e4be709709..09245097b4 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -4,7 +4,6 @@ features = [ "eq", "fmt", "hash", - "ord" ] includes = [ @@ -14,7 +13,9 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h" + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" ] [[fields]] diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 86244eab13..a43f1a2c81 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -54,12 +54,12 @@ struct LocalTensorBacking { }; UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, - ComputationGraph const &, + std::unordered_map const &, GradientTensorSource &); UnallocatedTensors generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, - ComputationGraph const &, + std::unordered_map const &, GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml index 87abf83d13..e86cc2a532 100644 --- a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml @@ -4,7 +4,6 @@ features = [ "eq", "fmt", "hash", - "ord" ] includes = [ @@ -14,7 +13,9 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h" + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" ] [[fields]] diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index e64db0cfff..19b149e7bd 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -22,17 +22,26 @@ bool is_allocated_tensor_backing_valid( bool are_allocated_forward_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { + 
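+  // A pre-allocated forward backing is valid only if its tensor_guid_t has
+  // an entry in tensor_attrs and the backing's shape matches that entry;
+  // a tensor_guid_t backing with no attrs entry is dangling and rejected.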
std::unordered_set all_tensor_guids = - set_union(keys(allocated_tensors.gradient_mapping), - keys(allocated_tensors.optimizer_mapping)); + transform( + keys(filter_keys(allocated_tensors.tensor_type_backings, + [&](TensorTypeVariant const &k) { + return k.has(); + })), + [&](TensorTypeVariant const &t) { return t.get(); } + ); + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_guid); - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - ArrayShape{expected_tensor_attrs.shape})) { + if (tensor_attrs.count(tensor_guid)) { + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + return false; + } + } else { return false; } } @@ -41,30 +50,29 @@ bool are_allocated_forward_tensors_valid( bool are_allocated_gradient_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { std::unordered_set - tensors_in_mappings; // will check whether any dangling gradient tensors - // were allocated + tensors_in_mappings; // will check for dangling gradient tensors for (std::pair const &tensor_to_grad : allocated_tensors.gradient_mapping) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_to_grad.first); - if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { - return false; - } + if (tensor_attrs.count(tensor_to_grad.first)) { + if (tensor_attrs.at(tensor_to_grad.first).create_gradients == + CreateGrad::NO) { + return false; + } - ArrayShape tensor_guid_array_shape = - allocated_tensors.tensor_type_backings - .at(TensorTypeVariant{tensor_to_grad.first}) - .shape; - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); + ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } } else { return false; } @@ -83,33 +91,30 @@ bool are_allocated_gradient_tensors_valid( bool are_allocated_optimizer_tensors_valid( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph) { + std::unordered_map const &tensor_attrs) { std::unordered_set - tensors_in_mappings; // will check whether any dangling optimizer tensors - // were allocated + tensors_in_mappings; // will check for dangling optimizer tensors for (std::pair> const &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - TensorAttrs expected_tensor_attrs = - get_tensor_attrs(computation_graph, tensor_to_optimizers.first); - if (expected_tensor_attrs.create_gradients == CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - allocated_tensors.tensor_type_backings - .at(TensorTypeVariant{tensor_to_optimizers.first}) - .shape; - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - 
TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { + if (tensor_attrs.count(tensor_to_optimizers.first)) { + if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + CreateGrad::NO) { return false; } + + ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } } } @@ -125,4 +130,10 @@ bool are_allocated_optimizer_tensors_valid( return true; } +bool are_allocated_tensors_valid(AllocatedTensors const & allocated_tensors, std::unordered_map const & tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) + && are_allocated_gradient_tensors_valid(allocated_tensors, tensor_attrs) + && are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 31418c6bea..c5c2fafa9d 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -89,19 +89,13 @@ CostDetails LocalCostEstimator::estimate_cost( LocalTrainingBacking local_backing( allocator, + AllocatedTensors{{}, {}, {}}, computation_graph, - LocalTensorBacking{}, - LocalArgsBacking{this->runtime_arg_config}); - - allocate_all_computation_graph_tensors(local_backing.local_tensor_backing, - local_backing.gradient_tensor_source, - local_backing.computation_graph, - local_backing.allocator); + this->runtime_arg_config); // execute layer layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - execute_init(local_backing, operator_layer_guid); float fwd = execute_forward(local_backing, operator_layer_guid).value(); float bwd = execute_backward(local_backing, operator_layer_guid).value(); diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index 67bbd59c3b..c37cfc5fc4 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -61,21 +61,25 @@ lowered_tensor_t LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { lowered_tensor_t lowered_tensor = this->lowered_tensor_source.new_lowered_tensor(); - tensor_type.visit(overload{ + tensor_type.visit(overload{ [&](tensor_guid_t const &tensor_guid) { this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + return std::nullopt; }, [&](gradient_tensor_t const &gradient_tensor) { this->gradient_tensor_lowering_mapping.insert( {gradient_tensor, lowered_tensor}); + return std::nullopt; }, [&](optimizer_tensor_t const &optimizer_tensor) { this->optimizer_tensor_lowering_mapping.insert( {optimizer_tensor, lowered_tensor}); + return std::nullopt; }, [&](loss_tensor_t const &loss_tensor) { this->loss_tensor_lowering_mapping.insert( {loss_tensor, lowered_tensor}); + return std::nullopt; }, [&](auto const &any_tensor) { throw mk_runtime_error( @@ -88,16 +92,16 @@ GenericTensorAccessorW LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { lowered_tensor_t 
lowered_tensor = tensor_type.visit( overload{[&](tensor_guid_t const &tensor_guid) { - this->tensor_lowering_mapping.at(tensor_guid); + return this->tensor_lowering_mapping.at(tensor_guid); }, [&](gradient_tensor_t const &gradient_tensor) { - this->gradient_tensor_lowering_mapping.at(gradient_tensor); + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); }, [&](optimizer_tensor_t const &optimizer_tensor) { - this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); }, [&](loss_tensor_t const &loss_tensor) { - this->loss_tensor_lowering_mapping.at(loss_tensor); + return this->loss_tensor_lowering_mapping.at(loss_tensor); }, [&](auto const &any_tensor) { throw mk_runtime_error( @@ -108,18 +112,18 @@ GenericTensorAccessorW UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, + std::unordered_map const &tensor_attrs_mapping, GradientTensorSource &gradient_tensor_source) { - assert(are_allocated_forward_tensors_valid(allocated_tensors, - computation_graph)); - assert(are_allocated_gradient_tensors_valid(allocated_tensors, - computation_graph)); + + assert(are_allocated_tensors_valid( + allocated_tensors, tensor_attrs_mapping)); std::unordered_map tensor_type_shapes; std::unordered_map gradient_mapping; - for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid); + for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); @@ -140,15 +144,17 @@ UnallocatedTensors UnallocatedTensors generate_unallocated_tensors_with_optimizer( AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, + std::unordered_map const &tensor_attrs_mapping, GradientTensorSource &gradient_tensor_source, OptimizerTensorSource &optimizer_tensor_source, OptimizerAttrs const &optimizer_attrs) { UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, computation_graph, gradient_tensor_source); - assert(are_allocated_optimizer_tensors_valid(allocated_tensors, - computation_graph)); + allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); + + if (!get_num_optimizer_tensors(optimizer_attrs)) { + return unallocated_tensors; + } std::unordered_map tensor_type_shapes = unallocated_tensors.tensor_type_shapes; @@ -157,8 +163,9 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( std::unordered_map> optimizer_mapping; - for (tensor_guid_t const &tensor_guid : get_all_tensors(computation_graph)) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor_guid); + for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; if (tensor_attrs.create_gradients == CreateGrad::YES && !allocated_tensors.optimizer_mapping.count(tensor_guid)) { std::vector optimizer_tensors; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 8a0dc825eb..cb22240b7f 100644 --- 
a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -24,7 +24,7 @@ LocalTrainingBacking::LocalTrainingBacking( local_tensor_backing( allocated_tensors, generate_unallocated_tensors(allocated_tensors, - computation_graph, + get_all_tensor_attrs(this->computation_graph), this->gradient_tensor_source), allocator), local_args_backing(initialize_args_backing(this->task_registry, @@ -43,7 +43,7 @@ LocalTrainingBacking::LocalTrainingBacking( local_tensor_backing(allocated_tensors, generate_unallocated_tensors_with_optimizer( allocated_tensors, - computation_graph, + get_all_tensor_attrs(this->computation_graph), this->gradient_tensor_source, this->optimizer_tensor_source, optimizer_attrs), diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 930ab5c4e2..6e3d890176 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -2,7 +2,9 @@ ff_add_test_executable( NAME local-execution-tests SRC_PATTERNS - src/*.cc + src/test_allocated_tensors.cc + src/test_unallocated_tensors.cc + src/test_utils.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc new file mode 100644 index 0000000000..59537cfae1 --- /dev/null +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -0,0 +1,221 @@ +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/loss_tensor_source.h" +#include "pcg/computation_graph.dtg.h" +#include "test/utils/doctest/fmt/pair.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "test/utils/doctest/fmt/variant.h" +#include "test/utils/doctest/fmt/vector.h" +#include "test_utils.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("AllocatedTensors") { + MockTensorGuidSource tensor_guid_source; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + Allocator allocator = create_local_cpu_memory_allocator(); + + tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); + + TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 10} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 20} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 30} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::YES + }; + + GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + + std::unordered_map tensor_attrs_mapping = { + {mock_tensor_1, 
tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + }; + + SUBCASE("Trivial tensors") { + SUBCASE("Empty") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Loss tensor") { + loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{loss_tensor}, tensor_backing_1} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + } + + SUBCASE("Forward tensors") { + SUBCASE("Correct forward tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Incorrect forward tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_2} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + + SUBCASE("Gradient tensors") { + gradient_tensor_t grad_tensor_3 = gradient_tensor_source.new_gradient_tensor(); + + SUBCASE("Gradient tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Dangling gradient tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling gradient tensor in mapping") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, { + {mock_tensor_3_with_grad, grad_tensor_3} + }, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient allocated for forward tensor without gradient") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{mock_tensor_2, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient tensor with wrong shape") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_2} + }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Gradient tensor with dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} + }, {{dangling_tensor, grad_tensor_3}}, {}}; + bool result = 
are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + + SUBCASE("Optimizer tensors") { + optimizer_tensor_t optimizer_tensor_3 = optimizer_tensor_source.new_optimizer_tensor(); + + SUBCASE("Optimizer tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == true); + } + + SUBCASE("Dangling optimizer tensor") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Dangling optimizer tensor in mapping") { + AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_3}} + }}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer allocated for forward tensor without gradient") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{mock_tensor_2, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer tensor with wrong shape") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2} + }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + + SUBCASE("Optimizer tensor with dangling tensor guid") { + AllocatedTensors allocated_tensors = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} + }, {}, {{dangling_tensor, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); + CHECK (result == false); + } + } + } +} diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc new file mode 100644 index 0000000000..9802821f3e --- /dev/null +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -0,0 +1,383 @@ +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_tensor_backing.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/loss_tensor_source.h" +#include "pcg/computation_graph.dtg.h" +#include "test/utils/doctest/fmt/pair.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "test/utils/doctest/fmt/variant.h" +#include "test/utils/doctest/fmt/vector.h" +#include "test_utils.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("UnallocatedTensors") { + MockTensorGuidSource tensor_guid_source; + OptimizerTensorSource optimizer_tensor_source; + + Allocator allocator = create_local_cpu_memory_allocator(); + + tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + + TensorAttrs 
tensor_attrs_1_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 10} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 20} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::NO + }; + TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ + TensorShape{ + TensorDims{ + FFOrdered{16, 30} + }, + DataType::FLOAT + }, + std::nullopt, + std::nullopt, + CreateGrad::YES + }; + + GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + + std::unordered_map tensor_attrs_mapping = { + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + }; + + SUBCASE("Without optimizer") { + SUBCASE("AllocatedTensors is empty") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(empty, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only 1 forward tensor") { + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + }, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only forward tensors") { + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + }, {}, {}}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = 
mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains only gradient tensor") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("AllocatedTensors contains mixture") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {}, + {} + }; + CHECK (result == correct); + } + + SUBCASE("Fully AllocatedTensors") { + GradientTensorSource gradient_tensor_source; + gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; + UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); + + UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; + CHECK (result == correct); + } + } + + SUBCASE("With optimizer") { + SUBCASE("SGD Attrs") { + SUBCASE("without momentum") { + double momentum = 0.0; + OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + UnallocatedTensors correct = generate_unallocated_tensors(empty, tensor_attrs_mapping, 
mock_gradient_tensor_source); + CHECK (result == correct); + } + SUBCASE("with momentum") { + double momentum = 0.9; + OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + + SUBCASE("unallocated") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor = mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}} + }; + + CHECK (result == correct); + } + + SUBCASE("allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + + CHECK (result == correct); + } + } + } + SUBCASE("Adam Attrs") { + OptimizerAttrs attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + SUBCASE("Empty") { + AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensour_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = 
mock_optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}} + }; + + CHECK (result == correct); + } + SUBCASE("Partially allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_1}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_2}}} + }; + + CHECK (result == correct); + } + + SUBCASE("Fully allocated") { + OptimizerTensorSource optimizer_tensour_source; + optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ + {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3} + }, {}, { + {mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}} + }}; + GradientTensorSource gradient_tensor_source; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, + tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); + + GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + OptimizerTensorSource mock_optimizer_tensour_source; + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); 
+ + std::unordered_map<TensorTypeVariant, TensorShape> correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = UnallocatedTensors{ + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {} + }; + + CHECK (result == correct); + } + } + } + } +} diff --git a/lib/local-execution/test/src/test_utils.cc b/lib/local-execution/test/src/test_utils.cc index 095e1272a2..b7a4e16b97 100644 --- a/lib/local-execution/test/src/test_utils.cc +++ b/lib/local-execution/test/src/test_utils.cc @@ -1,4 +1,5 @@ #include "test_utils.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { @@ -6,4 +7,13 @@ PerDeviceFFHandle get_mock_per_device_ff_handle() { return {nullptr, nullptr, nullptr, 0, false}; } +size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; + +MockTensorGuidSource::MockTensorGuidSource() {} + +tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { + size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; + return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/test_utils.h index 9a7b3f5991..6d6dcf5afe 100644 --- a/lib/local-execution/test/src/test_utils.h +++ b/lib/local-execution/test/src/test_utils.h @@ -5,6 +5,16 @@ namespace FlexFlow { +struct MockTensorGuidSource { +public: + MockTensorGuidSource(); + + tensor_guid_t new_mock_tensor_guid(); + +private: + static size_t next_available_mock_tensor_guid; +}; + PerDeviceFFHandle get_mock_per_device_ff_handle(); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index c473ae1f40..589496e61b 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -42,6 +42,8 @@ std::vector<tensor_guid_t> get_incoming_weights(ComputationGraph const &, layer_guid_t const &); std::unordered_set<tensor_guid_t> get_all_tensors(ComputationGraph const &); +std::unordered_map<tensor_guid_t, TensorAttrs> + get_all_tensor_attrs(ComputationGraph const &); std::unordered_set<ComputationGraphEdge> get_subgraph_incoming_edges(ComputationGraph const &, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 74448f18bc..728a150c2a 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -135,6 +135,16 @@ std::unordered_set<tensor_guid_t> get_all_tensors(ComputationGraph const &cg) { [](DataflowOutput const &t) { return tensor_guid_t(t); }); } +std::unordered_map<tensor_guid_t, TensorAttrs> + get_all_tensor_attrs(ComputationGraph const &cg) { + std::unordered_set<tensor_guid_t> all_tensors = get_all_tensors(cg); + std::unordered_map<tensor_guid_t, TensorAttrs> all_tensor_attrs; + for (tensor_guid_t const &tensor_guid : all_tensors) { + all_tensor_attrs.insert({tensor_guid, get_tensor_attrs(cg, tensor_guid)}); + } + return all_tensor_attrs; +} + std::unordered_set<ComputationGraphEdge> get_subgraph_incoming_edges( ComputationGraph const &cg, std::unordered_set<layer_guid_t> const &subgraph_nodes) { diff --git a/lib/utils/include/utils/required_core.h b/lib/utils/include/utils/required_core.h index 7a7abcd2c4..8ac772439f 100644 --- a/lib/utils/include/utils/required_core.h +++ b/lib/utils/include/utils/required_core.h @@ -232,7 +232,7 @@ namespace std { template <typename T> struct hash<::FlexFlow::req<T>> { size_t operator()(::FlexFlow::req<T> const &r) const 
{ - return get_std_hash(static_cast<T>(r)); + return ::FlexFlow::get_std_hash(static_cast<T>(r)); } }; From a0f81132754d91f7cacf6250b2fb38c42d58f7fc Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:25:58 -0800 Subject: [PATCH 42/91] Fix nonnegative --- lib/local-execution/test/src/test_allocated_tensors.cc | 6 +++--- lib/local-execution/test/src/test_unallocated_tensors.cc | 9 +++------ lib/local-execution/test/src/test_utils.h | 1 + 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 59537cfae1..f4f0664141 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 10} + FFOrdered{16_n, 10_n} }, DataType::FLOAT }, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 20} + FFOrdered{16_n, 20_n} }, DataType::FLOAT }, @@ -52,7 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 30} + FFOrdered{16_n, 30_n} }, DataType::FLOAT }, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 9802821f3e..65aabc2043 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 10} + FFOrdered{16_n, 10_n} }, DataType::FLOAT }, @@ -39,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 20} + FFOrdered{16_n, 20_n} }, DataType::FLOAT }, @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{ TensorDims{ - FFOrdered{16, 30} + FFOrdered{16_n, 30_n} }, DataType::FLOAT }, @@ -359,9 +359,6 @@ TEST_SUITE(FF_TEST_SUITE) { GradientTensorSource mock_gradient_tensor_source; gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); std::unordered_map<TensorTypeVariant, TensorShape> correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/test_utils.h index 6d6dcf5afe..056e92687c 100644 --- a/lib/local-execution/test/src/test_utils.h +++ b/lib/local-execution/test/src/test_utils.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS #include "kernels/ff_handle.h" +#include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { From b1eab94bcd7ddf473f65da9b5afa01602115ec5b Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 12:26:55 -0800 Subject: [PATCH 43/91] Format --- .../local-execution/allocated_tensors.h | 4 +- .../local-execution/local_tensor_backing.h | 21 +- lib/local-execution/src/allocated_tensors.cc | 30 +- .../src/local_cost_estimator.cc | 9 +- .../src/local_tensor_backing.cc | 54 +- .../src/local_training_backing.cc | 12 +- .../test/src/test_allocated_tensors.cc 
| 286 ++++---- .../test/src/test_unallocated_tensors.cc | 623 ++++++++++-------- 8 files changed, 582 insertions(+), 457 deletions(-) diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h index 0d01350d9f..7581a159ad 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -16,7 +16,9 @@ bool are_allocated_optimizer_tensors_valid( AllocatedTensors const &, std::unordered_map<tensor_guid_t, TensorAttrs> const &); -bool are_allocated_tensors_valid(AllocatedTensors const &, std::unordered_map<tensor_guid_t, TensorAttrs> const &); +bool are_allocated_tensors_valid( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &); bool is_allocated_tensor_backing_valid( TensorTypeVariant const &, diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index a43f1a2c81..c05e39beae 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -53,16 +53,17 @@ struct LocalTensorBacking { LoweredTensorSource lowered_tensor_source; }; -UnallocatedTensors generate_unallocated_tensors(AllocatedTensors const &, - std::unordered_map<tensor_guid_t, TensorAttrs> const &, - GradientTensorSource &); - -UnallocatedTensors - generate_unallocated_tensors_with_optimizer(AllocatedTensors const &, - std::unordered_map<tensor_guid_t, TensorAttrs> const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map<tensor_guid_t, TensorAttrs> const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 19b149e7bd..3e249bf6d1 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -24,14 +24,11 @@ bool are_allocated_forward_tensors_valid( AllocatedTensors const &allocated_tensors, std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs) { - std::unordered_set<tensor_guid_t> all_tensor_guids = - transform( - keys(filter_keys(allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { - return k.has<tensor_guid_t>(); - })), - [&](TensorTypeVariant const &t) { return t.get<tensor_guid_t>(); } - ); + std::unordered_set<tensor_guid_t> all_tensor_guids = transform( + keys(filter_keys( + allocated_tensors.tensor_type_backings, + [&](TensorTypeVariant const &k) { return k.has<tensor_guid_t>(); })), + [&](TensorTypeVariant const &t) { return t.get<tensor_guid_t>(); }); for (tensor_guid_t const &tensor_guid : all_tensor_guids) { if (tensor_attrs.count(tensor_guid)) { @@ -62,7 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -103,7 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape 
tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( @@ -130,10 +129,13 @@ bool are_allocated_optimizer_tensors_valid( return true; } -bool are_allocated_tensors_valid(AllocatedTensors const & allocated_tensors, std::unordered_map const & tensor_attrs) { - return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) - && are_allocated_gradient_tensors_valid(allocated_tensors, tensor_attrs) - && are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +bool are_allocated_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && + are_allocated_gradient_tensors_valid(allocated_tensors, + tensor_attrs) && + are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); } } // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index c5c2fafa9d..41a5df8d48 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -87,11 +87,10 @@ CostDetails LocalCostEstimator::estimate_cost( std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); - LocalTrainingBacking local_backing( - allocator, - AllocatedTensors{{}, {}, {}}, - computation_graph, - this->runtime_arg_config); + LocalTrainingBacking local_backing(allocator, + AllocatedTensors{{}, {}, {}}, + computation_graph, + this->runtime_arg_config); // execute layer layer_guid_t operator_layer_guid = diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index c37cfc5fc4..be84d77906 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -90,38 +90,39 @@ lowered_tensor_t GenericTensorAccessorW LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { - lowered_tensor_t lowered_tensor = tensor_type.visit( - overload{[&](tensor_guid_t const &tensor_guid) { - return this->tensor_lowering_mapping.at(tensor_guid); - }, - [&](gradient_tensor_t const &gradient_tensor) { - return this->gradient_tensor_lowering_mapping.at(gradient_tensor); - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); - }, - [&](loss_tensor_t const &loss_tensor) { - return this->loss_tensor_lowering_mapping.at(loss_tensor); - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); + lowered_tensor_t lowered_tensor = + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + return this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + return this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); return this->tensor_backings.at(lowered_tensor); } -UnallocatedTensors - 
generate_unallocated_tensors(AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source) { +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source) { - assert(are_allocated_tensors_valid( - allocated_tensors, tensor_attrs_mapping)); + assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); std::unordered_map tensor_type_shapes; std::unordered_map gradient_mapping; - for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; @@ -151,7 +152,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - + if (!get_num_optimizer_tensors(optimizer_attrs)) { return unallocated_tensors; } @@ -163,7 +164,8 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( std::unordered_map> optimizer_mapping; - for (std::pair const &tensor_guid_attrs : tensor_attrs_mapping) { + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; if (tensor_attrs.create_gradients == CreateGrad::YES && diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index cb22240b7f..35436a60fd 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -21,12 +21,12 @@ LocalTrainingBacking::LocalTrainingBacking( RuntimeArgConfig const &runtime_arg_config) : computation_graph(computation_graph), task_registry(construct_task_registry(computation_graph)), - local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors(allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - allocator), + local_tensor_backing(allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, + get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + allocator), local_args_backing(initialize_args_backing(this->task_registry, this->computation_graph, runtime_arg_config, diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index f4f0664141..99abd538d5 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,8 +1,8 @@ #include "local-execution/allocated_tensors.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" @@ -24,197 +24,211 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t mock_tensor_2 = 
tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = + tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 10_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 20_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 30_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::YES - }; + TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::YES}; - GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + GenericTensorAccessorW tensor_backing_1 = + allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = + allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = + allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, }; SUBCASE("Trivial tensors") { SUBCASE("Empty") { AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Loss tensor") { loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{loss_tensor}, tensor_backing_1} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{loss_tensor}, tensor_backing_1}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } } SUBCASE("Forward tensors") { SUBCASE("Correct forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{mock_tensor_1}, tensor_backing_1}}, {}, {}}; + bool result = 
are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Incorrect forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_2} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{mock_tensor_1}, tensor_backing_2}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, + }, + {}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } - + SUBCASE("Gradient tensors") { - gradient_tensor_t grad_tensor_3 = gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor_3 = + gradient_tensor_source.new_gradient_tensor(); SUBCASE("Gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{mock_tensor_3_with_grad, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Dangling gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, {}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } SUBCASE("Dangling gradient tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, { - {mock_tensor_3_with_grad, grad_tensor_3} - }, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {}, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{mock_tensor_2, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{mock_tensor_2, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + 
tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_2} - }, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_2}}, + {{mock_tensor_3_with_grad, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Gradient tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor_3}, tensor_backing_3} - }, {{dangling_tensor, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, + {{dangling_tensor, grad_tensor_3}}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } - + SUBCASE("Optimizer tensors") { - optimizer_tensor_t optimizer_tensor_3 = optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_3 = + optimizer_tensor_source.new_optimizer_tensor(); SUBCASE("Optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == true); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == true); } - + SUBCASE("Dangling optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } SUBCASE("Dangling optimizer tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_3}} - }}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{mock_tensor_2, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors 
allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{mock_tensor_2, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2} - }, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } - + SUBCASE("Optimizer tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3} - }, {}, {{dangling_tensor, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping); - CHECK (result == false); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, + {}, + {{dangling_tensor, {optimizer_tensor_3}}}}; + bool result = are_allocated_tensors_valid(allocated_tensors, + tensor_attrs_mapping); + CHECK(result == false); } } } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 65aabc2043..ddad7f4574 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,9 +1,9 @@ #include "local-execution/allocated_tensors.h" -#include "local-execution/local_tensor_backing.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/local_cpu_allocator.h" +#include "local-execution/local_tensor_backing.h" #include "local-execution/loss_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" @@ -23,173 +23,208 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + tensor_guid_t mock_tensor_3_with_grad = + tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 10_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 20_n} - }, - DataType::FLOAT - }, - std::nullopt, - std::nullopt, - CreateGrad::NO - }; + TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{ - TensorDims{ - FFOrdered{16_n, 30_n} - }, - DataType::FLOAT - }, - 
std::nullopt, - std::nullopt, - CreateGrad::YES - }; + TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + DataType::FLOAT}, + std::nullopt, + std::nullopt, + CreateGrad::YES}; - GenericTensorAccessorW tensor_backing_1 = allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); + GenericTensorAccessorW tensor_backing_1 = + allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); + GenericTensorAccessorW tensor_backing_2 = + allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); + GenericTensorAccessorW tensor_backing_3 = + allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, + {mock_tensor_1, tensor_attrs_1_no_grad}, + {mock_tensor_2, tensor_attrs_2_no_grad}, + {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, }; SUBCASE("Without optimizer") { SUBCASE("AllocatedTensors is empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(empty, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = generate_unallocated_tensors( + empty, tensor_attrs_mapping, gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only 1 forward tensor") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - }, {}, {}}; + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + }, + {}, + {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = 
mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only forward tensors") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - }, {}, {}}; + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + }, + {}, + {}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains only gradient tensor") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - 
correct_tensor_type_shapes, - {}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; + CHECK(result == correct); } SUBCASE("AllocatedTensors contains mixture") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {}, - {} - }; - CHECK (result == correct); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{grad_tensor}, tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; + CHECK(result == correct); } SUBCASE("Fully AllocatedTensors") { GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = gradient_tensor_source.new_gradient_tensor(); - AllocatedTensors allocated_forward_tensors = AllocatedTensors{{ - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, {{mock_tensor_3_with_grad, grad_tensor}}, {}}; - UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ + { + {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, + {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, + {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, + {TensorTypeVariant{grad_tensor}, 
tensor_backing_3}, + }, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; + UnallocatedTensors result = + generate_unallocated_tensors(allocated_forward_tensors, + tensor_attrs_mapping, + gradient_tensor_source); + UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; - CHECK (result == correct); + CHECK(result == correct); } } @@ -197,182 +232,252 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("SGD Attrs") { SUBCASE("without momentum") { double momentum = 0.0; - OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + OptimizerAttrs attrs = + OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - UnallocatedTensors correct = generate_unallocated_tensors(empty, tensor_attrs_mapping, mock_gradient_tensor_source); - CHECK (result == correct); + UnallocatedTensors correct = generate_unallocated_tensors( + empty, tensor_attrs_mapping, mock_gradient_tensor_source); + CHECK(result == correct); } SUBCASE("with momentum") { double momentum = 0.9; - OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; + OptimizerAttrs attrs = + OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; SUBCASE("unallocated") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = mock_optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor}, tensor_attrs_3_with_grad.shape}, - }; + optimizer_tensor_t optimizer_tensor = + mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor}, + tensor_attrs_3_with_grad.shape}, + }; UnallocatedTensors correct = 
UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}} - }; + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor}} - }}; + optimizer_tensor_t optimizer_tensor = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; - CHECK (result == correct); + CHECK(result == correct); } } } SUBCASE("Adam Attrs") { - OptimizerAttrs attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; + OptimizerAttrs attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; SUBCASE("Empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensour_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(empty, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + empty, + tensor_attrs_mapping, + 
gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = mock_optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = mock_optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}} - }; + optimizer_tensor_t optimizer_tensor_1 = + mock_optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + mock_optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_1}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, + {optimizer_tensor_1, optimizer_tensor_2}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("Partially allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_1}} - }}; + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); + gradient_tensor_t grad_tensor = + mock_gradient_tensor_source.new_gradient_tensor(); OptimizerTensorSource mock_optimizer_tensour_source; 
optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_attrs_3_with_grad.shape}, - }; + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensour_source.new_optimizer_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{optimizer_tensor_2}, + tensor_attrs_3_with_grad.shape}, + }; UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_2}}} - }; + correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {{mock_tensor_3_with_grad, {optimizer_tensor_2}}}}; - CHECK (result == correct); + CHECK(result == correct); } SUBCASE("Fully allocated") { OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = optimizer_tensour_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{{ - {TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3} - }, {}, { - {mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}} - }}; + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensour_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, + {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, + {}, + {{mock_tensor_3_with_grad, + {optimizer_tensor_1, optimizer_tensor_2}}}}; GradientTensorSource gradient_tensor_source; - UnallocatedTensors result = generate_unallocated_tensors_with_optimizer(allocated_optimizer_tensor, - tensor_attrs_mapping, gradient_tensor_source, optimizer_tensor_source, attrs); - + UnallocatedTensors result = + generate_unallocated_tensors_with_optimizer( + allocated_optimizer_tensor, + tensor_attrs_mapping, + gradient_tensor_source, + optimizer_tensor_source, + attrs); + GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = mock_gradient_tensor_source.new_gradient_tensor(); - - std::unordered_map correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {} - }; + gradient_tensor_t grad_tensor = + 
mock_gradient_tensor_source.new_gradient_tensor(); + + std::unordered_map + correct_tensor_type_shapes = { + {TensorTypeVariant{mock_tensor_1}, + tensor_attrs_1_no_grad.shape}, + {TensorTypeVariant{mock_tensor_2}, + tensor_attrs_2_no_grad.shape}, + {TensorTypeVariant{mock_tensor_3_with_grad}, + tensor_attrs_3_with_grad.shape}, + {TensorTypeVariant{grad_tensor}, + tensor_attrs_3_with_grad.shape}, + }; + UnallocatedTensors correct = + UnallocatedTensors{correct_tensor_type_shapes, + {{mock_tensor_3_with_grad, grad_tensor}}, + {}}; - CHECK (result == correct); + CHECK(result == correct); } } } From b532c5023861ea8f0391c0aef4dc86e42cda0d22 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 13:19:25 -0800 Subject: [PATCH 44/91] Pass allocated-unallocated tests --- lib/kernels/src/legion_dim.cc | 6 + .../local-execution/gradient_tensor_source.h | 2 + .../local-execution/optimizer_tensor_source.h | 2 + .../src/gradient_tensor_source.cc | 4 + .../src/local_tensor_backing.cc | 11 +- .../src/local_training_backing.cc | 3 +- .../src/optimizer_tensor_source.cc | 4 + .../test/src/test_unallocated_tensors.cc | 128 +++++++----------- lib/pcg/src/pcg/computation_graph.cc | 2 +- 9 files changed, 73 insertions(+), 89 deletions(-) diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 23875ad916..49b028f227 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -19,4 +19,10 @@ ff_dim_t legion_dim_from_ff_dim(legion_dim_t legion_dim, legion_dim.value.unwrap_nonnegative() - 1}}; } +ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim, + nonnegative_int num_dimensions) { + return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - + legion_dim.value.unwrap_nonnegative() - 1}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h index e7d24d1ca5..d724859712 100644 --- a/lib/local-execution/include/local-execution/gradient_tensor_source.h +++ b/lib/local-execution/include/local-execution/gradient_tensor_source.h @@ -11,6 +11,8 @@ struct GradientTensorSource { gradient_tensor_t new_gradient_tensor(); + void reset(); + private: static size_t next_available_gradient_tensor_id; }; diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h index 7a5057c84a..b2b3d94ba5 100644 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ b/lib/local-execution/include/local-execution/optimizer_tensor_source.h @@ -11,6 +11,8 @@ struct OptimizerTensorSource { optimizer_tensor_t new_optimizer_tensor(); + void reset(); + private: static size_t next_available_optimizer_tensor_id; }; diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/local-execution/src/gradient_tensor_source.cc index 28cec16ef9..7dcb947e89 100644 --- a/lib/local-execution/src/gradient_tensor_source.cc +++ b/lib/local-execution/src/gradient_tensor_source.cc @@ -11,4 +11,8 @@ gradient_tensor_t GradientTensorSource::new_gradient_tensor() { GradientTensorSource::next_available_gradient_tensor_id++}; } +void GradientTensorSource::reset() { + GradientTensorSource::next_available_gradient_tensor_id = 0; +} + } // namespace FlexFlow diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index be84d77906..b5a0deaee4 100644 --- 
a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -135,7 +135,7 @@ UnallocatedTensors generate_unallocated_tensors( gradient_tensor_t gradient_tensor = gradient_tensor_source.new_gradient_tensor(); tensor_type_shapes.insert( - {TensorTypeVariant{tensor_guid}, tensor_attrs.shape}); + {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); gradient_mapping.insert({tensor_guid, gradient_tensor}); } } @@ -168,8 +168,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES && - !allocated_tensors.optimizer_mapping.count(tensor_guid)) { + if (tensor_attrs.create_gradients == CreateGrad::YES) { std::vector optimizer_tensors; int num_optimizer_tensors_to_allocate = @@ -178,6 +177,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } + std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = @@ -186,7 +186,10 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_type_shapes.insert( {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); } - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + + if (num_optimizer_tensors_to_allocate > 0) { + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + } } } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 35436a60fd..23db484d0b 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -101,8 +101,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, std::optional execute_forward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { + layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/local-execution/src/optimizer_tensor_source.cc index c241c7f4bd..a1a9a2927d 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/local-execution/src/optimizer_tensor_source.cc @@ -11,4 +11,8 @@ optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { OptimizerTensorSource::next_available_optimizer_tensor_id++}; } +void OptimizerTensorSource::reset() { + OptimizerTensorSource::next_available_optimizer_tensor_id = 0; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index ddad7f4574..00f4c1c27c 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -17,8 +17,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("UnallocatedTensors") { MockTensorGuidSource tensor_guid_source; + GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); + Allocator allocator = create_local_cpu_memory_allocator(); tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); @@ -26,6 +30,13 
@@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t mock_tensor_3_with_grad = tensor_guid_source.new_mock_tensor_guid(); + gradient_tensor_t grad_tensor = + gradient_tensor_source.new_gradient_tensor(); + optimizer_tensor_t optimizer_tensor_1 = + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensor_t optimizer_tensor_2 = + optimizer_tensor_source.new_optimizer_tensor(); + TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, @@ -61,13 +72,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("Without optimizer") { SUBCASE("AllocatedTensors is empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors( empty, tensor_attrs_mapping, gradient_tensor_source); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -93,15 +101,12 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, {}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_2}, @@ -127,15 +132,13 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, {}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors(allocated_forward_tensors, tensor_attrs_mapping, gradient_tensor_source); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{grad_tensor}, @@ -149,9 +152,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("AllocatedTensors contains only gradient tensor") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{grad_tensor}, tensor_backing_3}, @@ -178,9 +179,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("AllocatedTensors contains mixture") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, @@ -206,9 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Fully AllocatedTensors") { - GradientTensorSource gradient_tensor_source; - gradient_tensor_t grad_tensor = - gradient_tensor_source.new_gradient_tensor(); + AllocatedTensors allocated_forward_tensors = AllocatedTensors{ { {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, @@ -235,8 +232,8 @@ TEST_SUITE(FF_TEST_SUITE) { OptimizerAttrs attrs = OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -245,9 +242,9 @@ 
TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; + gradient_tensor_source.reset(); UnallocatedTensors correct = generate_unallocated_tensors( - empty, tensor_attrs_mapping, mock_gradient_tensor_source); + empty, tensor_attrs_mapping, gradient_tensor_source); CHECK(result == correct); } SUBCASE("with momentum") { @@ -257,8 +254,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("unallocated") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -267,13 +265,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = - mock_optimizer_tensour_source.new_optimizer_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -284,26 +275,25 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_attrs_3_with_grad.shape}, {TensorTypeVariant{grad_tensor}, tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor}, + {TensorTypeVariant{optimizer_tensor_1}, tensor_attrs_3_with_grad.shape}, }; UnallocatedTensors correct = UnallocatedTensors{ correct_tensor_type_shapes, {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; CHECK(result == correct); } SUBCASE("allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor = - optimizer_tensour_source.new_optimizer_tensor(); + AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor}, tensor_backing_3}}, + {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, {}, - {{mock_tensor_3_with_grad, {optimizer_tensor}}}}; - GradientTensorSource gradient_tensor_source; + {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -312,10 +302,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -348,8 +334,9 @@ TEST_SUITE(FF_TEST_SUITE) { /*epsilon=*/1e-8}}; SUBCASE("Empty") { AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensour_source; + + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( empty, @@ -358,15 +345,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - mock_optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - mock_optimizer_tensour_source.new_optimizer_tensor(); - 
std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -391,14 +369,16 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } SUBCASE("Partially allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensour_source.new_optimizer_tensor(); + gradient_tensor_source.reset(); + optimizer_tensor_source.reset(); + optimizer_tensor_t optimizer_tensor_pre_allocated = + optimizer_tensor_source.new_optimizer_tensor(); AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, + {{TensorTypeVariant{optimizer_tensor_pre_allocated}, + tensor_backing_3}}, {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - GradientTensorSource gradient_tensor_source; + {{mock_tensor_3_with_grad, {optimizer_tensor_pre_allocated}}}}; + UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -407,14 +387,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - OptimizerTensorSource mock_optimizer_tensour_source; - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensour_source.new_optimizer_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, @@ -437,18 +409,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Fully allocated") { - OptimizerTensorSource optimizer_tensour_source; - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensour_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensour_source.new_optimizer_tensor(); AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_1, optimizer_tensor_2}}}}; - GradientTensorSource gradient_tensor_source; + + gradient_tensor_source.reset(); UnallocatedTensors result = generate_unallocated_tensors_with_optimizer( allocated_optimizer_tensor, @@ -457,10 +425,6 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source, attrs); - GradientTensorSource mock_gradient_tensor_source; - gradient_tensor_t grad_tensor = - mock_gradient_tensor_source.new_gradient_tensor(); - std::unordered_map correct_tensor_type_shapes = { {TensorTypeVariant{mock_tensor_1}, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 728a150c2a..1cb7bb6d2a 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -85,7 +85,7 @@ std::vector get_incoming_tensors(ComputationGraph const &cg, } std::vector get_incoming_input_shapes(ComputationGraph const &cg, - layer_guid_t n) { + layer_guid_t const &n) { return transform(get_incoming_inputs(cg, n), [&](tensor_guid_t const &t) { return get_tensor_attrs(cg, t).shape; }); From f28e5c2fbfaa8065dc1ea8d33a0b669e9b763ffe Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Thu, 13 Feb 2025 14:34:02 -0800 Subject: [PATCH 45/91] Update task registry tests --- .../include/local-execution/task_registry.h | 3 +- .../src/local_training_backing.cc | 6 +- lib/local-execution/src/task_registry.cc | 15 ++-- lib/local-execution/test/CMakeLists.txt | 2 + .../test/src/test_task_registry.cc | 77 ++++++++++++------- 
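A note before the diffs for this patch: the point of the change is that construct_task_registry now consumes a plain std::unordered_map<layer_guid_t, LayerAttrs> (built via the new get_layer_attrs_mapping helper) instead of walking a ComputationGraph, so the tests below can build registries directly from literal maps. A minimal stand-alone sketch of that construction pattern, with hypothetical stand-in types in place of the FlexFlow headers (illustrative only, not the real API):

    #include <optional>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    // Stand-in types; hypothetical, for illustration only.
    using LayerId = int;
    using TaskId = int;
    enum class TaskKind { INIT, FWD, BWD };

    struct Registry {
      std::unordered_map<LayerId, std::optional<TaskId>> init_ids, fwd_ids,
          bwd_ids;
    };

    // Every layer gets an entry up front (std::nullopt), then each of its
    // tasks fills the slot for its kind, mirroring construct_task_registry.
    Registry construct_registry(
        std::unordered_map<LayerId,
                           std::vector<std::pair<TaskId, TaskKind>>> const
            &layer_tasks) {
      Registry r;
      for (auto const &[layer, tasks] : layer_tasks) {
        r.init_ids.insert({layer, std::nullopt});
        r.fwd_ids.insert({layer, std::nullopt});
        r.bwd_ids.insert({layer, std::nullopt});
        for (auto const &[task_id, kind] : tasks) {
          switch (kind) {
            case TaskKind::INIT: r.init_ids[layer] = task_id; break;
            case TaskKind::FWD:  r.fwd_ids[layer] = task_id;  break;
            case TaskKind::BWD:  r.bwd_ids[layer] = task_id;  break;
          }
        }
      }
      return r;
    }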
 lib/pcg/include/pcg/computation_graph.h       |  3 +
 lib/pcg/src/pcg/computation_graph.cc          |  9 +++
 7 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h
index 56e98ba8da..eb3e0859d0 100644
--- a/lib/local-execution/include/local-execution/task_registry.h
+++ b/lib/local-execution/include/local-execution/task_registry.h
@@ -9,7 +9,8 @@

 namespace FlexFlow {

-TaskRegistry construct_task_registry(ComputationGraph const &);
+TaskRegistry construct_task_registry(
+    std::unordered_map<layer_guid_t, LayerAttrs> const &);

 bool registry_contains_task_for_layer(TaskRegistry const &,
                                       layer_guid_t const &,
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index 23db484d0b..f09234b920 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -20,7 +20,8 @@ LocalTrainingBacking::LocalTrainingBacking(
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
-      task_registry(construct_task_registry(computation_graph)),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))),
       local_tensor_backing(allocated_tensors,
                            generate_unallocated_tensors(
                                allocated_tensors,
@@ -39,7 +40,8 @@ LocalTrainingBacking::LocalTrainingBacking(
     RuntimeArgConfig const &runtime_arg_config,
     OptimizerAttrs const &optimizer_attrs)
     : computation_graph(computation_graph),
-      task_registry(construct_task_registry(computation_graph)),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))),
       local_tensor_backing(allocated_tensors,
                            generate_unallocated_tensors_with_optimizer(
                                allocated_tensors,
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
index f33aef8460..487bd4420e 100644
--- a/lib/local-execution/src/task_registry.cc
+++ b/lib/local-execution/src/task_registry.cc
@@ -4,19 +4,22 @@

 namespace FlexFlow {

-TaskRegistry construct_task_registry(ComputationGraph const &cg) {
+TaskRegistry construct_task_registry(
+    std::unordered_map<layer_guid_t, LayerAttrs> const &layer_attrs_mapping) {
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> init_task_ids;
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> fwd_task_ids;
   std::unordered_map<layer_guid_t, std::optional<task_id_t>> bwd_task_ids;
   std::unordered_map<task_id_t, TaskSignatureAndImpl> task_mapping;

-  for (layer_guid_t const &node : topological_ordering(cg)) {
+  for (std::pair<layer_guid_t, LayerAttrs> const &layer_attrs :
+       layer_attrs_mapping) {
+    layer_guid_t node = layer_attrs.first;
     init_task_ids.insert({node, std::nullopt});
     fwd_task_ids.insert({node, std::nullopt});
     bwd_task_ids.insert({node, std::nullopt});

-    ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs;
+    ComputationGraphOpAttrs attrs = layer_attrs.second.attrs;

     std::vector<task_id_t> task_ids = get_task_ids(attrs);
     for (task_id_t const &task_id : task_ids) {
@@ -29,13 +32,13 @@ TaskRegistry construct_task_registry(ComputationGraph const &cg) {
         break;
       case OpTaskType::FWD:
         assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
+                                   forward(attrs)));
         fwd_task_ids[node] = task_id;
         break;
       case OpTaskType::BWD:
         assert(is_invocation_valid(task_signature_impl.task_signature,
-                                   init(attrs)));
-        fwd_task_ids[node] = task_id;
+                                   backward(attrs)));
+        bwd_task_ids[node] = task_id;
         break;
       default:
         throw mk_runtime_error(
diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt
index 6e3d890176..fc647cff9b 100644
--- a/lib/local-execution/test/CMakeLists.txt
+++ b/lib/local-execution/test/CMakeLists.txt
@@ -4,7 +4,9 @@ ff_add_test_executable(
   SRC_PATTERNS
     src/test_allocated_tensors.cc
     src/test_unallocated_tensors.cc
+    src/test_task_registry.cc
     src/test_utils.cc
+    src/test_local_task_arg_accessor.cc
   PRIVATE_INCLUDE
     src/
   DEPS
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index 16325d4763..20b4f11a2a 100644
--- a/lib/local-execution/test/src/test_task_registry.cc
+++ b/lib/local-execution/test/src/test_task_registry.cc
@@ -10,7 +10,6 @@ using namespace ::FlexFlow;

 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Task Registry") {
-    TaskRegistry task_registry = empty_task_registry();
     layer_guid_t layer_guid = layer_guid_t{Node{0}};

     nonnegative_int embed_dim = 32_n;
@@ -28,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};

     SUBCASE("register single layer") {
-      register_tasks_for_layer(task_registry, layer_guid, attrs);
+      TaskRegistry task_registry = construct_task_registry(
+          {{layer_guid, LayerAttrs{attrs, std::nullopt}}});

       TaskRegistry correct_task_registry = [&] {
         std::unordered_map<layer_guid_t, std::optional<task_id_t>>
@@ -53,8 +53,10 @@ TEST_SUITE(FF_TEST_SUITE) {

     SUBCASE("multiple layers same task") {
       layer_guid_t other_layer_guid = layer_guid_t{Node{1}};
-      register_tasks_for_layer(task_registry, layer_guid, attrs);
-      register_tasks_for_layer(task_registry, other_layer_guid, attrs);
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+          {other_layer_guid, LayerAttrs{attrs, std::nullopt}},
+      });

       SUBCASE("layer to task ids") {
         std::unordered_map<layer_guid_t, std::optional<task_id_t>> correct = {
@@ -64,6 +66,39 @@ TEST_SUITE(FF_TEST_SUITE) {
         CHECK(correct == task_registry.init_task_ids);
       }

+      SUBCASE("task to signature+impl mapping") {
+        std::unordered_map<task_id_t, TaskSignatureAndImpl>
+            correct_task_mapping = {
+                {task_id_t::ATTENTION_INIT_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)},
+                {task_id_t::ATTENTION_FWD_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)},
+                {task_id_t::ATTENTION_BWD_TASK_ID,
+                 get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}};
+        CHECK(correct_task_mapping == task_registry.task_mapping);
+      }
+    }
+    SUBCASE("different attrs, still same task fn mapping") {
+      layer_guid_t layer_1 = layer_guid_t{Node{1}};
+      nonnegative_int embed_dim = 100_n;
+      layer_guid_t layer_2 = layer_guid_t{Node{2}};
+      ComputationGraphOpAttrs other_attrs =
+          ComputationGraphOpAttrs{MultiHeadAttentionAttrs{
+              /*embed_dim=*/embed_dim,
+              /*num_heads=*/num_heads,
+              /*kdim=*/embed_dim,
+              /*vdim=*/embed_dim,
+              /*dropout=*/0.0,
+              /*bias=*/true,
+              /*add_bias_kv=*/false,
+              /*add_zero_attn=*/false,
+          }};
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+          {layer_1, LayerAttrs{attrs, std::nullopt}},
+          {layer_2, LayerAttrs{other_attrs, std::nullopt}},
+      });
+
       std::unordered_map<task_id_t, TaskSignatureAndImpl>
           correct_task_mapping = {{task_id_t::ATTENTION_INIT_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)},
                                   {task_id_t::ATTENTION_FWD_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)},
                                   {task_id_t::ATTENTION_BWD_TASK_ID,
                                    get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}};
-      SUBCASE("task to signature+impl mapping") {
-        CHECK(correct_task_mapping == task_registry.task_mapping);
-      }
-      SUBCASE("different attrs, still same task fn mapping") {
-        nonnegative_int embed_dim = 100_n;
-        layer_guid_t layer_3 = layer_guid_t{Node{3}};
-        ComputationGraphOpAttrs other_attrs =
-            ComputationGraphOpAttrs{MultiHeadAttentionAttrs{
-                /*embed_dim=*/embed_dim,
-                /*num_heads=*/num_heads,
-                /*kdim=*/embed_dim,
-                /*vdim=*/embed_dim,
-                /*dropout=*/0.0,
-                /*bias=*/true,
-                /*add_bias_kv=*/false,
-                /*add_zero_attn=*/false,
-            }};
-        register_tasks_for_layer(task_registry, layer_3, other_attrs);
-        CHECK(correct_task_mapping == task_registry.task_mapping);
-      }
+      CHECK(correct_task_mapping == task_registry.task_mapping);
     }

     SUBCASE("equality") {
-      TaskRegistry other_task_registry = empty_task_registry();
       SUBCASE("different attrs is still equal") {
         nonnegative_int embed_dim = 100_n;
         ComputationGraphOpAttrs other_attrs =
@@ -110,16 +125,20 @@ TEST_SUITE(FF_TEST_SUITE) {
             /*add_zero_attn=*/false,
         }};

-        register_tasks_for_layer(task_registry, layer_guid, attrs);
-        register_tasks_for_layer(other_task_registry, layer_guid, other_attrs);
+        TaskRegistry task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{attrs, std::nullopt}}});
+        TaskRegistry other_task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}});
         CHECK(task_registry == other_task_registry);
       }

       SUBCASE("different layer_guid is not equal") {
-        register_tasks_for_layer(task_registry, layer_guid, attrs);
+        TaskRegistry task_registry = construct_task_registry(
+            {{layer_guid, LayerAttrs{attrs, std::nullopt}}});
         layer_guid_t other_layer_guid = layer_guid_t{Node{1}};
-        register_tasks_for_layer(other_task_registry, other_layer_guid, attrs);
+        TaskRegistry other_task_registry = construct_task_registry(
+            {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}});

         CHECK(task_registry != other_task_registry);
       }
diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h
index 589496e61b..e9ee69134d 100644
--- a/lib/pcg/include/pcg/computation_graph.h
+++ b/lib/pcg/include/pcg/computation_graph.h
@@ -57,6 +57,9 @@ std::unordered_set
 LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n);

+std::unordered_map<layer_guid_t, LayerAttrs>
+    get_layer_attrs_mapping(ComputationGraph const &cg);
+
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name);

diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc
index 1cb7bb6d2a..b932910499 100644
--- a/lib/pcg/src/pcg/computation_graph.cc
+++ b/lib/pcg/src/pcg/computation_graph.cc
@@ -190,6 +190,15 @@ LayerAttrs get_layer_attrs(ComputationGraph const &cg, layer_guid_t const &n) {
   return cg.raw_graph.at(n.raw_node);
 }

+std::unordered_map<layer_guid_t, LayerAttrs>
+    get_layer_attrs_mapping(ComputationGraph const &cg) {
+  std::unordered_map<layer_guid_t, LayerAttrs> layer_attrs_mapping;
+  for (layer_guid_t const &layer_guid : get_layers(cg)) {
+    layer_attrs_mapping.insert({layer_guid, get_layer_attrs(cg, layer_guid)});
+  }
+  return layer_attrs_mapping;
+}
+
 layer_guid_t get_layer_by_name(ComputationGraph const &cg,
                                std::string const &name) {
   std::unordered_set<layer_guid_t> found =

From 9c16d7682543092fdfa67dc104066779fc32442b Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Feb 2025 11:31:03 -0800
Subject: [PATCH 46/91] feat: initial implementation of realm-backend

---
 .../include/realm-backend/allocated_tensors.h |  30 ++
 .../allocated_tensors.struct.toml             |  32 ++
 .../realm-backend/model_training_instance.h   |  19 +-
 .../realm-backend/realm_args_backing.h        |  34 +-
 .../realm_task_argument_accessor.h            |  19 +-
 .../realm-backend/realm_tensor_backing.h      |  82 +--
 .../realm-backend/realm_training_backing.h    |  57 ++-
 .../include/realm-backend/task_result.h       |   1 +
 .../unallocated_tensors.struct.toml           |  31 ++
 lib/realm-backend/src/allocated_tensors.cc    | 141 ++++++
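A note before the realm-backend diffs below: the new backing mirrors the local-execution split between AllocatedTensors and UnallocatedTensors, including the counting rule from PATCH 44, which requests only the optimizer tensors that are not already allocated and records an optimizer-mapping entry only when at least one new tensor is needed. A stand-alone sketch of that rule, using plain integer ids in place of tensor_guid_t and optimizer_tensor_t (hypothetical names, illustration only):

    #include <unordered_map>
    #include <vector>

    // Stand-in ids; the real code uses tensor_guid_t and optimizer_tensor_t.
    using TensorId = int;
    using OptTensorId = int;

    // Mirrors the counting rule in generate_unallocated_tensors_with_optimizer:
    // request only the optimizer tensors that are not already allocated.
    std::vector<OptTensorId> request_optimizer_tensors(
        TensorId weight,
        int num_required, // e.g. 1 for SGD with momentum, 2 for Adam
        std::unordered_map<TensorId, std::vector<OptTensorId>> const
            &pre_allocated,
        int &next_id) {
      int to_allocate = num_required;
      if (pre_allocated.count(weight)) {
        to_allocate -= static_cast<int>(pre_allocated.at(weight).size());
      }
      std::vector<OptTensorId> fresh;
      for (int i = 0; i < to_allocate; ++i) {
        fresh.push_back(next_id++);
      }
      // An empty result means no entry is recorded in the optimizer mapping.
      return fresh;
    }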
.../src/model_training_instance.cc | 81 ++- lib/realm-backend/src/realm_args_backing.cc | 84 ++-- .../src/realm_tensor_backing copy.cc | 142 ++++++ lib/realm-backend/src/realm_tensor_backing.cc | 286 +++++++---- .../src/realm_training_backing.cc | 471 +++++++++++------- 15 files changed, 1042 insertions(+), 468 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.h create mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml create mode 100644 lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml create mode 100644 lib/realm-backend/src/allocated_tensors.cc create mode 100644 lib/realm-backend/src/realm_tensor_backing copy.cc diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.h b/lib/realm-backend/include/realm-backend/allocated_tensors.h new file mode 100644 index 0000000000..8effd06954 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/allocated_tensors.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H + +#include "realm-backend/allocated_tensors.dtg.h" +#include "pcg/computation_graph.h" + +namespace FlexFlow { + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool are_allocated_tensors_valid( + AllocatedTensors const &, + std::unordered_map const &); + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &, + std::unordered_map const &, + ArrayShape const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml new file mode 100644 index 0000000000..d459027e5d --- /dev/null +++ b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "AllocatedTensors" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "realm-backend/realm_allocator.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" +] + +[[fields]] +name = "tensor_type_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, std::pair<::FlexFlow::RealmRegion,::FlexFlow::TensorShape>>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index aa3876fb0d..a35cada2d2 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -12,26 +12,23 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(ComputationGraph const &, - RuntimeArgConfig const &, - LossAttrs const &, + ModelTrainingInstance(RealmTrainingBacking const &, tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + TensorShape const 
&label_tensor_shape, + LossAttrs const &, OptimizerAttrs const &); - void execute_init(); - PerLayerElapsedTime execute_forward(); - PerLayerElapsedTime execute_backward(); - void execute_update(); - - ComputationGraph computation_graph; RealmTrainingBacking training_backing; - LossAttrs loss_attrs; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; + LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; }; +PerLayerElapsedTime forward(ModelTrainingInstance &); +PerLayerElapsedTime backward(ModelTrainingInstance &); +void update(ModelTrainingInstance &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h index 88db880fcb..75f954c0ad 100644 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_args_backing.h @@ -1,38 +1,38 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#include "local-execution/op_task_invocation.h" -#include "local-execution/per_device_op_state.h" -#include "local-execution/runtime_arg_config.h" -#include "local-execution/task_invocation.dtg.h" #include "pcg/computation_graph.h" #include "pcg/layer_guid_t.dtg.h" #include "realm-backend/realm_task_argument_accessor.h" #include "realm-backend/task_result.h" +#include "task-spec/op_task_invocation.h" +#include "task-spec/per_device_op_state.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/task_invocation.dtg.h" namespace FlexFlow { struct RealmArgsBacking { - RealmArgsBacking(RuntimeArgConfig const &); - -public: - void add_per_device_op_state(layer_guid_t const &, - Future &&); - - ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &) const; - - ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &) const; - ConcreteArgSpec lower_to_concrete_arg_spec(OpArgRefSpec const &, - ComputationGraph const &, - layer_guid_t const &) const; + RealmArgsBacking(RuntimeArgConfig const &, + std::unordered_map const &); public: // arguments + RuntimeArgConfig runtime_arg_config; std::unordered_map per_device_op_states; - RuntimeArgConfig runtime_arg_config; }; +RealmArgsBacking +make_args_backing_with_empty_device_states(RuntimeArgConfig const &); + +std::optional +get_per_device_op_state_if_exists(RealmArgsBacking const &, + layer_guid_t const &); + +ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, + RuntimeArgConfig const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index 5c7ecafd0f..ce826e162e 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/slot_tensor_type_id.dtg.h" #include "local-execution/task_argument_accessor.h" +#include "realm-backend/realm_allocator.h" +#include "task-spec/slot_tensor_type_id.dtg.h" #include #include @@ -14,7 +15,7 @@ using TensorSlotsBacking = std::unordered_map< using ArgSlotsBacking = std::unordered_map; struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(Allocator const &allocator, + RealmTaskArgumentAccessor(RealmAllocator const 
&allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); @@ -23,18 +24,18 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; - GenericTensorAccessor get_tensor(slot_id_t slot, - Permissions priv, + GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, TensorType tensor_type) const override; - VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const override; + VariadicGenericTensorAccessor + get_variadic_tensor(slot_id_t slot, Permissions priv, + TensorType tensor_type) const override; Allocator get_allocator() const override; size_t get_device_idx() const override; private: - Allocator allocator; + RealmAllocator allocator; TensorSlotsBacking tensor_slots_backing; ArgSlotsBacking arg_slots_backing; }; @@ -45,8 +46,8 @@ using TensorSlotsBackingWithoutAddresses = std::unordered_map< std::vector>>>; TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); +get_slots_backing_without_tensor_allocation_addresses( + TensorSlotsBacking const &); CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index d9df0dfcb1..25136ad2ff 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -3,58 +3,70 @@ #define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H #include "kernels/accessor.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/realm_allocator.h" -#include "local-execution/task_invocation.dtg.h" -#include "local-execution/tensor_role.dtg.h" -#include "local-execution/lowered_tensor_t.dtg.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" -#include "local-execution/optimizer_tensor_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" -#include "pcg/tensor_guid_t.dtg.h" #include "pcg/layer_guid_t.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/allocated_tensors.dtg.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_task_argument_accessor.h" +#include "realm-backend/unallocated_tensors.dtg.h" +#include "task-spec/lowered_tensor_t.dtg.h" +#include "task-spec/task_invocation.dtg.h" +#include "task-spec/tensor_role.dtg.h" namespace FlexFlow { -using TensorRegionMap = - std::unordered_map; -using TensorShapeMap = - std::unordered_map; +using TensorBackingMap = std::unordered_map>; struct RealmTensorBacking { - RealmTensorBacking(); + RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, + RealmAllocator const &); public: - void allocate_layer_tensors(layer_guid_t const &, - ComputationGraph const &, - RealmAllocator &); - void allocate_tensors_by_role(TensorRole const &, - layer_guid_t const &, - ComputationGraph const &, - RealmAllocator &); - void allocate_optimizer_tensors(tensor_guid_t const &, - std::vector const &, - RealmAllocator &); - TensorSlotsBacking - construct_tensor_slots_backing(TaskBinding const &) const; - - GenericTensorAccessorW const & - get_tensor_backing(lowered_tensor_t const &) 
const; - - bool is_tensor_allocated(lowered_tensor_t const &) const; + GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; public: // tensors - TensorRegionMap tensor_regions; - TensorShapeMap tensor_shapes; + TensorBackingMap tensor_backings; + std::unordered_map tensor_lowering_mapping; - std::unordered_map gradient_tensor_lowering_mapping; - std::unordered_map optimizer_tensor_lowering_mapping; - std::unordered_map loss_tensor_lowering_mapping; + std::unordered_map + gradient_tensor_lowering_mapping; + std::unordered_map + optimizer_tensor_lowering_mapping; + std::unordered_map + loss_tensor_lowering_mapping; + + std::unordered_map tensor_gradient_mapping; + std::unordered_map> + tensor_optimizer_mapping; + + RealmAllocator allocator; + +private: + lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; +GenericTensorAccessorW wrappup_tensor_accessor(std::pair const &); + +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); + +TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, + TaskBinding const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index ddd3bb7ed1..81df422b7a 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,6 +6,7 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/allocated_tensors.dtg.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -18,28 +19,14 @@ using PerLayerElapsedTime = std::unordered_map>; struct RealmTrainingBacking { - RealmTrainingBacking(ComputationGraph const &, RuntimeArgConfig const &, - Realm::Processor); - void register_and_allocate_layer(layer_guid_t const &); - void allocate_layer_optimizer_tensors(layer_guid_t const &, - OptimizerAttrs const &); + RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + ComputationGraph const &, RuntimeArgConfig const &); - void execute_init(layer_guid_t const &); - Future> execute_forward(layer_guid_t const &); - Future> execute_backward(layer_guid_t const &); - Future execute_update(layer_guid_t const &, OptimizerAttrs const &); - Future compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); - - TaskArgumentAccessor get_task_arg_accessor(TaskInvocation const &) const; - - TaskInvocation lower_to_task_invocation(OpTaskInvocation const &, - layer_guid_t const &) const; - - ComputationGraph computation_graph; - TaskRegistry task_registry; + RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + ComputationGraph const &, RuntimeArgConfig const &, + OptimizerAttrs const &); +public: // runtime Realm::Processor master_proc; Realm::Memory master_mem; @@ -47,18 +34,34 @@ struct RealmTrainingBacking { std::unordered_map proc_events; std::vector allocators; - // storage RealmTensorBacking realm_tensor_backing; RealmArgsBacking realm_args_backing; - 
OptimizerTensorSource optimizer_tensor_source; - std::unordered_map> - layer_optimizer_tensor_ids; -private: - std::optional call_task_impl(task_id_t, TaskSignatureAndImpl, - TaskArgumentAccessor); + ComputationGraph computation_graph; + TaskRegistry task_registry; + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; }; +RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, + RuntimeArgConfig const &); + +void execute_init(RealmTrainingBacking &, layer_guid_t const &); +Future> execute_forward(RealmTrainingBacking &, + layer_guid_t const &); +Future> execute_backward(RealmTrainingBacking &, + layer_guid_t const &); +Future compute_loss(RealmTrainingBacking &, LossAttrs const &, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor); +Future execute_update(RealmTrainingBacking &, layer_guid_t const &, + OptimizerAttrs const &); + +TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, + RealmArgsBacking const &, + TaskInvocation const &); + } // namespace FlexFlow #endif diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index 5fb158496e..4cf8916f85 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -3,6 +3,7 @@ #include "realm-backend/driver.h" #include +#include namespace FlexFlow { diff --git a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml new file mode 100644 index 0000000000..e86cc2a532 --- /dev/null +++ b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "UnallocatedTensors" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "op-attrs/tensor_shape.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h" +] + +[[fields]] +name = "tensor_type_shapes" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" + +[[fields]] +name = "gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc new file mode 100644 index 0000000000..f27db14643 --- /dev/null +++ b/lib/realm-backend/src/allocated_tensors.cc @@ -0,0 +1,141 @@ +#include "realm-backend/allocated_tensors.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_union.h" + +namespace FlexFlow { + +bool is_allocated_tensor_backing_valid( + TensorTypeVariant const &tensor_type, + std::unordered_map const + &allocated_tensor_backings, + ArrayShape const &expected_shape) { + if (allocated_tensor_backings.count(tensor_type)) { + GenericTensorAccessorW tensor_backing = + allocated_tensor_backings.at(tensor_type); + if (expected_shape == tensor_backing.shape) { + return true; + } + } + return false; +}; + +bool are_allocated_forward_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + + std::unordered_set all_tensor_guids = transform( + keys(filter_keys( + allocated_tensors.tensor_type_backings, + 
[&](TensorTypeVariant const &k) { return k.has(); })), + [&](TensorTypeVariant const &t) { return t.get(); }); + + for (tensor_guid_t const &tensor_guid : all_tensor_guids) { + if (tensor_attrs.count(tensor_guid)) { + if (!is_allocated_tensor_backing_valid( + TensorTypeVariant{tensor_guid}, + allocated_tensors.tensor_type_backings, + ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + return false; + } + } else { + return false; + } + } + return true; +} + +bool are_allocated_gradient_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + std::unordered_set + tensors_in_mappings; // will check for dangling gradient tensors + + for (std::pair const &tensor_to_grad : + allocated_tensors.gradient_mapping) { + if (tensor_attrs.count(tensor_to_grad.first)) { + if (tensor_attrs.at(tensor_to_grad.first).create_gradients == + CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + TensorTypeVariant gradient_tensor = + TensorTypeVariant{tensor_to_grad.second}; + if (is_allocated_tensor_backing_valid( + gradient_tensor, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(gradient_tensor); + } else { + return false; + } + } else { + return false; + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + return true; +} + +bool are_allocated_optimizer_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + std::unordered_set + tensors_in_mappings; // will check for dangling optimizer tensors + + for (std::pair> const + &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { + if (tensor_attrs.count(tensor_to_optimizers.first)) { + if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + CreateGrad::NO) { + return false; + } + + ArrayShape tensor_guid_array_shape = + ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + for (optimizer_tensor_t const &optimizer_tensor : + tensor_to_optimizers.second) { + if (is_allocated_tensor_backing_valid( + TensorTypeVariant{optimizer_tensor}, + allocated_tensors.tensor_type_backings, + tensor_guid_array_shape)) { + tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); + } else { + return false; + } + } + } + } + + for (TensorTypeVariant const &tensor_type : + keys(allocated_tensors.tensor_type_backings)) { + if (tensor_type.has()) { + if (!tensors_in_mappings.count(tensor_type)) { + return false; + } + } + } + + return true; +} + +bool are_allocated_tensors_valid( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs) { + return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && + are_allocated_gradient_tensors_valid(allocated_tensors, + tensor_attrs) && + are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); +} + +} // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index f9c959c389..acb8edb314 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -5,43 +5,27 @@ namespace FlexFlow { -ModelTrainingInstance::ModelTrainingInstance( - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, 
LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, + ModelTrainingInstance::ModelTrainingInstance( + RealmTrainingBacking const &realm_training_backing, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor, + LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - training_backing(computation_graph, runtime_arg_config), - loss_attrs(loss_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor), optimizer_attrs(optimizer_attrs) { + : training_backing(realm_training_backing), loss_attrs(loss_attrs), + optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), + label_tensor(label_tensor){}; - // allocate each layer's tensors - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.register_and_allocate_layer(node); - this->training_backing.allocate_layer_optimizer_tensors( - node, this->optimizer_attrs); - } -} - -void ModelTrainingInstance::execute_init() { - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - this->training_backing.execute_init(node); - } -} - -PerLayerElapsedTime ModelTrainingInstance::execute_forward() { +PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_elapsed_time_future.insert( - {node, this->training_backing.execute_forward(node)}); + {node, execute_forward(model_training_instance.training_backing, node)}); } - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { std::optional elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -49,19 +33,22 @@ PerLayerElapsedTime ModelTrainingInstance::execute_forward() { return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::execute_backward() { - this->training_backing.compute_loss(this->loss_attrs, this->logit_tensor, - this->label_tensor); +PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { + compute_loss(model_training_instance.training_backing, + model_training_instance.loss_attrs, + model_training_instance.logit_tensor, + model_training_instance.label_tensor); + PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { per_layer_elapsed_time_future.insert( - {node, this->training_backing.execute_backward(node)}); + {node, execute_backward(model_training_instance.training_backing, node)}); } - for (layer_guid_t const &node : - reversed(topological_ordering(this->computation_graph))) { + for (layer_guid_t const &node : reversed(topological_ordering( + model_training_instance.training_backing.computation_graph))) { std::optional elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -69,19 +56,21 @@ 
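// (Hedged usage sketch, not part of this patch: with the member functions
// replaced by free functions -- forward above, backward and update below --
// a hypothetical training step reads
//
//   PerLayerElapsedTime fwd_times = forward(instance);
//   PerLayerElapsedTime bwd_times = backward(instance); // runs compute_loss first
//   update(instance); // advances optimizer_attrs for the next iteration
//
// where `instance` is an assumed, already-constructed ModelTrainingInstance.)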
PerLayerElapsedTime ModelTrainingInstance::execute_backward() { return per_layer_elapsed_time; } -void ModelTrainingInstance::execute_update() { +void update(ModelTrainingInstance &model_training_instance) { std::unordered_map> per_layer_update_future; - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_update_future.insert( - {node, this->training_backing.execute_update(node, this->optimizer_attrs)}); + {node, execute_update(model_training_instance.training_backing, + node, + model_training_instance.optimizer_attrs)}); } - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { + for (layer_guid_t const &node : topological_ordering( + model_training_instance.training_backing.computation_graph)) { per_layer_update_future[node].wait(); } - this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); + model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( + model_training_instance.optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc index ae7022f4b0..e20fcdc14d 100644 --- a/lib/realm-backend/src/realm_args_backing.cc +++ b/lib/realm-backend/src/realm_args_backing.cc @@ -1,65 +1,55 @@ #include "op-attrs/parallel_tensor_shape.h" #include "realm-backend/realm_args_backing.h" +#include "task-spec/op_task_to_task_invocation.h" #include "utils/containers/contains_key.h" #include "utils/containers/map_values.h" #include "utils/overload.h" namespace FlexFlow { -void RealmArgsBacking::add_per_device_op_state( - layer_guid_t const &op_guid, Future &&future) { - if (per_device_op_states.find(op_guid) != per_device_op_states.end()) { - throw mk_runtime_error("Op state already exists"); - } - per_device_op_states.insert({op_guid, std::move(future)}); -} +// void RealmArgsBacking::add_per_device_op_state( +// layer_guid_t const &op_guid, Future &&future) +// { +// if (per_device_op_states.find(op_guid) != per_device_op_states.end()) { +// throw mk_runtime_error("Op state already exists"); +// } +// per_device_op_states.insert({op_guid, std::move(future)}); +// } -ArgSlotsBacking RealmArgsBacking::construct_arg_slots_backing( - TaskBinding const &binding) const { - return map_values(binding.get_arg_bindings(), - [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return this->lower_to_concrete_arg_spec(s); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; +RealmArgsBacking make_args_backing_with_empty_device_states( + RuntimeArgConfig const &runtime_arg_config) { +return RealmArgsBacking{runtime_arg_config, {}}; } -ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec( - OpArgRefSpec const &op_arg_ref_spec, ComputationGraph const &cg, - layer_guid_t const &op_guid) const { - if (op_arg_ref_spec.holds()) { - assert(contains_key(this->per_device_op_states, op_guid)); - DeviceSpecificDeviceStates device_specific = - per_device_op_states.at(op_guid); - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_specific, 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - tensor_guid_t input_tensor = - get_incoming_inputs(cg, 
op_guid).at(index_op_arg_ref.idx); - TensorAttrs tensor_attrs = get_tensor_attrs(cg, input_tensor); - ParallelTensorShape shape = lift_to_parallel(tensor_attrs.shape); - return ConcreteArgSpec::create(shape); +RealmArgsBacking::RealmArgsBacking( + RuntimeArgConfig const &runtime_arg_config, + std::unordered_map const + &device_states) + : runtime_arg_config(runtime_arg_config), + per_device_op_states(device_states){}; + +std::optional get_per_device_op_state_if_exists( + RealmArgsBacking const &realm_args_backing, + layer_guid_t const &layer_guid) { + if (contains_key(realm_args_backing.per_device_op_states, layer_guid)) { + return realm_args_backing.per_device_op_states.at(layer_guid); } else { - throw mk_runtime_error("Unhandled op arg ref type"); + return std::nullopt; } } -ConcreteArgSpec RealmArgsBacking::lower_to_concrete_arg_spec( - RuntimeArgRefSpec const &runtime_arg_ref_spec) const { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create( - *(this->runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(this->runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); - } +ArgSlotsBacking + construct_arg_slots_backing(TaskBinding const &binding, + RuntimeArgConfig const &runtime_arg_config) { + return map_values( + binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { + return arg_binding.template visit( + overload{[&](RuntimeArgRefSpec const &s) { + return lower_to_concrete_arg_spec(s, runtime_arg_config); + }, + [](ConcreteArgSpec const &s) { return s; }}); + }); + ; } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing copy.cc b/lib/realm-backend/src/realm_tensor_backing copy.cc new file mode 100644 index 0000000000..bac16c6b69 --- /dev/null +++ b/lib/realm-backend/src/realm_tensor_backing copy.cc @@ -0,0 +1,142 @@ +#include "task-spec/slot_grad_id.dtg.h" + +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_tensor_backing.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" +#include "utils/overload.h" + +namespace FlexFlow { + +RealmTensorBacking::RealmTensorBacking() {}; + +void RealmTensorBacking::allocate_layer_tensors( + layer_guid_t const &layer_guid, ComputationGraph const &computation_graph, + RealmAllocator &allocator) { + this->allocate_tensors_by_role(TensorRole::INPUT, layer_guid, + computation_graph, allocator); + this->allocate_tensors_by_role(TensorRole::WEIGHT, layer_guid, + computation_graph, allocator); + this->allocate_tensors_by_role(TensorRole::OUTPUT, layer_guid, + computation_graph, allocator); +} + +void RealmTensorBacking::allocate_tensors_by_role( + TensorRole const &role, layer_guid_t const &layer_guid, + ComputationGraph const &computation_graph, RealmAllocator &allocator) { + std::vector tensors; + switch (role) { + case TensorRole::INPUT: + tensors = get_incoming_inputs(computation_graph, layer_guid); + break; + case TensorRole::WEIGHT: + tensors = get_incoming_weights(computation_graph, layer_guid); + break; + case TensorRole::OUTPUT: + tensors = get_outgoing_tensors(computation_graph, layer_guid); + break; + default: + throw mk_runtime_error("Invalid tensor role, got {}", role); + } + + for (tensor_guid_t const &tensor : tensors) { + TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); + // 
tensor allocation + if (!contains_key(this->tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); + RealmRegion region = + allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); + this->tensor_regions.insert({reduced_tensor, region}); + this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + } + + // gradient tensor allocation + if (tensor_attrs.create_gradients == CreateGrad::YES && + !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { + lowered_tensor_t reduced_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); + RealmRegion region = + allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); + this->tensor_regions.insert({reduced_tensor, region}); + this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + } + } +} + +void RealmTensorBacking::allocate_optimizer_tensors( + tensor_guid_t const &weight, + std::vector const &optimizer_tensors, + RealmAllocator &allocator) { + GenericTensorAccessorW weight_backing = + this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); + for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { + // optimizer tensor allocation + if (!contains_key(this->optimizer_tensor_lowering_mapping, + optimizer_tensor)) { + lowered_tensor_t buffer_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, buffer_tensor}); + TensorShape tensor_shape = + get_tensor_shape(weight_backing.shape, weight_backing.data_type); + RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); + this->tensor_regions.insert({buffer_tensor, region}); + this->tensor_shapes.insert({buffer_tensor, tensor_shape}); + } + } +} + +bool RealmTensorBacking::is_tensor_allocated( + lowered_tensor_t const &tensor_id) const { + return contains_key(tensor_regions, tensor_id); +} + +GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( + lowered_tensor_t const &tensor_id) const { + void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); + TensorShape shape = this->tensor_shapes.at(tensor_id); + return {shape.data_type, ArrayShape{shape}, ptr}; +} + +TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( + TaskBinding const &binding) const { + TensorSlotsBacking mapping; + + for (auto const &tensor_binding : binding.get_tensor_bindings()) { + SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; + + lowered_tensor_t tensor_id = [&] { + TensorTypeVariant tensor_type = tensor_binding.second; + if (tensor_type.has() and + slot_tensor_type_id.tensor_type == TensorType::FORWARD) { + return this->tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has() and + slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { + return this->gradient_tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has()) { + return this->optimizer_tensor_lowering_mapping.at( + tensor_type.get()); + } else if (tensor_type.has()) { + return this->loss_tensor_lowering_mapping.at( + tensor_type.get()); + } else { + throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); + } + }(); + + GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); + mapping.insert({slot_tensor_type_id, accessor}); + } + + return mapping; +} + +} // 
namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 6edf6cf064..8f8f828821 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -1,124 +1,220 @@ -#include "realm-backend/realm_tensor_backing.h" -#include "local-execution/tensor_lowering.h" #include "op-attrs/parallel_tensor_shape.h" -#include "op-attrs/tensor_shape.dtg.h" +#include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "realm-backend/allocated_tensors.h" #include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_tensor_backing.h" +#include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" +#include "utils/containers/keys.h" #include "utils/overload.h" -#include "local-execution/slot_grad_id.dtg.h" namespace FlexFlow { -RealmTensorBacking::RealmTensorBacking() {}; - -void RealmTensorBacking::allocate_layer_tensors( - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - RealmAllocator &allocator) { - this->allocate_tensors_by_role( - TensorRole::INPUT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::WEIGHT, layer_guid, computation_graph, allocator); - this->allocate_tensors_by_role( - TensorRole::OUTPUT, layer_guid, computation_graph, allocator); +GenericTensorAccessorW wrappup_tensor_accessor( + std::pair const &tensor_region_shape) { + void *ptr = tensor_region_shape.first.instance.pointer_untyped(0, 0); + TensorShape shape = tensor_region_shape.second; + return {shape.data_type, ArrayShape{shape}, ptr}; } -void RealmTensorBacking::allocate_tensors_by_role( - TensorRole const &role, - layer_guid_t const &layer_guid, - ComputationGraph const &computation_graph, - RealmAllocator &allocator) { - std::vector tensors; - switch (role) { - case TensorRole::INPUT: - tensors = get_incoming_inputs(computation_graph, layer_guid); - break; - case TensorRole::WEIGHT: - tensors = get_incoming_weights(computation_graph, layer_guid); - break; - case TensorRole::OUTPUT: - tensors = get_outgoing_tensors(computation_graph, layer_guid); - break; - default: - throw mk_runtime_error("Invalid tensor role, got {}", role); +RealmTensorBacking::RealmTensorBacking( + AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + RealmAllocator const &allocator) + : tensor_gradient_mapping(allocated_tensors.gradient_mapping), + tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), + allocator(allocator) { + + // handle already-allocated tensors + for (std::pair> const + &tensor_type_backing : allocated_tensors.tensor_type_backings) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_backing.first); + this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); } - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } + // allocate new tensors + 
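// (Editorial note on the merge below: unordered_map::insert over a range
// skips keys that are already present, and generate_unallocated_tensors only
// emits gradient pairings absent from the allocated set, so the two sources
// are assumed disjoint here.)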
this->tensor_gradient_mapping.insert( + unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); + for (std::pair> const + &unallocated_optimizer_tensors : + unallocated_tensors.optimizer_mapping) { + if (this->tensor_optimizer_mapping.count( + unallocated_optimizer_tensors.first)) { + for (optimizer_tensor_t const &optimizer_tensor : + unallocated_optimizer_tensors.second) { + this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] + .push_back(optimizer_tensor); + } + } else { + this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); } } -} -void RealmTensorBacking::allocate_optimizer_tensors( - tensor_guid_t const &weight, - std::vector const& optimizer_tensors, - RealmAllocator &allocator) { - GenericTensorAccessorW weight_backing = this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); - for (optimizer_tensor_t const & optimizer_tensor: optimizer_tensors) { - // optimizer tensor allocation - if (!contains_key(this->optimizer_tensor_lowering_mapping, optimizer_tensor)) { - lowered_tensor_t buffer_tensor = this->lowered_tensor_source.new_lowered_tensor(); - this->optimizer_tensor_lowering_mapping.insert({optimizer_tensor, buffer_tensor}); - TensorShape tensor_shape = get_tensor_shape(weight_backing.shape, weight_backing.data_type); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); - this->tensor_regions.insert({buffer_tensor, region}); - this->tensor_shapes.insert({buffer_tensor, tensor_shape}); - } + for (std::pair const &tensor_type_shape : + unallocated_tensors.tensor_type_shapes) { + lowered_tensor_t lowered_tensor = + this->insert_tensor(tensor_type_shape.first); + RealmRegion region = allocator.allocate( + get_size_in_bytes(tensor_type_shape.second).unwrap_nonnegative()); + this->tensor_backings.insert( + {lowered_tensor, {region, tensor_type_shape.second}}); } +}; + +lowered_tensor_t +RealmTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { + lowered_tensor_t lowered_tensor = + this->lowered_tensor_source.new_lowered_tensor(); + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); + return std::nullopt; + }, + [&](gradient_tensor_t const &gradient_tensor) { + this->gradient_tensor_lowering_mapping.insert( + {gradient_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + this->optimizer_tensor_lowering_mapping.insert( + {optimizer_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](loss_tensor_t const &loss_tensor) { + this->loss_tensor_lowering_mapping.insert( + {loss_tensor, lowered_tensor}); + return std::nullopt; + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return lowered_tensor; } -bool RealmTensorBacking::is_tensor_allocated(lowered_tensor_t const & tensor_id) const { - return contains_key(tensor_regions, 
tensor_id); +GenericTensorAccessorW +RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { + lowered_tensor_t lowered_tensor = + tensor_type.visit(overload{ + [&](tensor_guid_t const &tensor_guid) { + return this->tensor_lowering_mapping.at(tensor_guid); + }, + [&](gradient_tensor_t const &gradient_tensor) { + return this->gradient_tensor_lowering_mapping.at(gradient_tensor); + }, + [&](optimizer_tensor_t const &optimizer_tensor) { + return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); + }, + [&](loss_tensor_t const &loss_tensor) { + return this->loss_tensor_lowering_mapping.at(loss_tensor); + }, + [&](auto const &any_tensor) { + throw mk_runtime_error( + fmt::format("Unhandled tensor type {}", any_tensor)); + }}); + return wrappup_tensor_accessor(this->tensor_backings.at(lowered_tensor)); } -GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); - TensorShape shape = this->tensor_shapes.at(tensor_id); - return {shape.data_type, ArrayShape{shape}, ptr}; +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source) { + + assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); + + std::unordered_map tensor_type_shapes; + std::unordered_map gradient_mapping; + + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; + TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; + if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { + tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); + } + + if (tensor_attrs.create_gradients == CreateGrad::YES && + !allocated_tensors.gradient_mapping.count(tensor_guid)) { + gradient_tensor_t gradient_tensor = + gradient_tensor_source.new_gradient_tensor(); + tensor_type_shapes.insert( + {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); + gradient_mapping.insert({tensor_guid, gradient_tensor}); + } + } + + return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; } -TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { - TensorSlotsBacking mapping; +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &allocated_tensors, + std::unordered_map const &tensor_attrs_mapping, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + OptimizerAttrs const &optimizer_attrs) { + + UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( + allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); + + if (!get_num_optimizer_tensors(optimizer_attrs)) { + return unallocated_tensors; + } + + std::unordered_map tensor_type_shapes = + unallocated_tensors.tensor_type_shapes; + std::unordered_map gradient_mapping = + unallocated_tensors.gradient_mapping; + std::unordered_map> + optimizer_mapping; + + for (std::pair const &tensor_guid_attrs : + tensor_attrs_mapping) { + tensor_guid_t tensor_guid = tensor_guid_attrs.first; + TensorAttrs tensor_attrs = tensor_guid_attrs.second; + if (tensor_attrs.create_gradients == CreateGrad::YES) { + std::vector optimizer_tensors; + + int num_optimizer_tensors_to_allocate = + 
get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
-  for (auto const &tensor_binding : binding.get_tensor_bindings()) {
-    SlotTensorTypeId slot_tensor_type_id = tensor_binding.first;
-
-    lowered_tensor_t tensor_id = [&] {
-      TensorTypeVariant tensor_type = tensor_binding.second;
-      if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::FORWARD) {
-        return this->tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has() and slot_tensor_type_id.tensor_type == TensorType::GRADIENT) {
-        return this->gradient_tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has()) {
-        return this->optimizer_tensor_lowering_mapping.at(tensor_type.get());
-      } else if (tensor_type.has()) {
-        return this->loss_tensor_lowering_mapping.at(tensor_type.get());
-      } else {
-        throw mk_runtime_error(fmt::format("Tensor binding has invalid type"));
+      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
+        optimizer_tensor_t optimizer_tensor =
+            optimizer_tensor_source.new_optimizer_tensor();
+        optimizer_tensors.push_back(optimizer_tensor);
+        tensor_type_shapes.insert(
+            {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape});
       }
-    }();
-    GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id);
-    mapping.insert({slot_tensor_type_id, accessor});
+      if (num_optimizer_tensors_to_allocate > 0) {
+        optimizer_mapping.insert({tensor_guid, optimizer_tensors});
+      }
+    }
+  }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping,
+                            optimizer_mapping};
+}
+
+TensorSlotsBacking
+construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing,
+                               TaskBinding const &binding) {
+  TensorSlotsBacking mapping;
+
+  for (std::pair const &tensor_binding :
+       binding.get_tensor_bindings()) {
+    mapping.insert({tensor_binding.first,
+                    realm_tensor_backing.get_tensor(tensor_binding.second)});
   }

   return mapping;
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 24829a77b1..f6b516e303 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -1,30 +1,34 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/optimizer.h"
-#include "local-execution/task_id_t.dtg.h"
-#include "local-execution/task_invocation.h"
 #include "local-execution/task_signature_impl.h"
-#include "local-execution/tensor_lowering.h"
+#include "pcg/computation_graph.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_training_backing.h"
-#include "realm-backend/task_result.h"
-#include "realm-backend/task_wrapper.h"
+#include "realm-backend/realm_tensor_backing.h"
+#include "task-spec/op_task_to_task_invocation.h"
+#include "task-spec/runtime_arg_config.h"
+#include "task-spec/task_invocation.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
 #include "utils/containers/values.h"
 #include "utils/exception.h"
+
+#include "realm-backend/realm_training_backing.h"
+#include "realm-backend/task_result.h"
+#include "realm-backend/task_wrapper.h"
+
 namespace FlexFlow {

 using namespace Realm;

 RealmTrainingBacking::RealmTrainingBacking(
+    Processor master_proc, AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
-    RuntimeArgConfig const &runtime_arg_config, Realm::Processor master_proc)
+    RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
-      realm_args_backing(runtime_arg_config),
-      task_registry(empty_task_registry()) {
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))) {
   this->master_proc = master_proc;
   proc_events.insert({master_proc, Realm::Event::NO_EVENT});
   master_mem = Machine::MemoryQuery(Machine::get_machine())
@@ -36,95 +40,212 @@ RealmTrainingBacking::RealmTrainingBacking(
   for (Processor p : pq) {
     worker_procs.push_back(p);
     proc_events.insert({p, Realm::Event::NO_EVENT});
-    allocators.push_back(RealmAllocator(p));
+    allocators.push_back(RealmAllocator::create(p));
   }
   assert(worker_procs.size() > 0);
-}

-void RealmTrainingBacking::register_and_allocate_layer(
-    layer_guid_t const &node) {
-  ComputationGraphOpAttrs attrs =
-      get_layer_attrs(this->computation_graph, node).attrs;
-  this->realm_tensor_backing.allocate_layer_tensors(
-      node, this->computation_graph, this->allocators[0]);
-  register_tasks_for_layer(this->task_registry, node, attrs);
-  // TODO: multi gpu launching
-  std::vector task_ids = get_task_ids(attrs);
-  for (task_id_t task_id : task_ids) {
-    TaskSignatureAndImpl task_signature_impl =
-        this->task_registry.task_mapping.at(task_id);
-    register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl);
+  // register tasks for realm
+  for (layer_guid_t const &node :
+       topological_ordering(this->computation_graph)) {
+    ComputationGraphOpAttrs attrs =
+        get_layer_attrs(this->computation_graph, node).attrs;
+    if (attrs.has()) {
+      OpTaskInvocation op_task_invocation = attrs.get();
+      std::vector task_ids = get_task_ids(attrs);
+      for (task_id_t task_id : task_ids) {
+        TaskSignatureAndImpl task_signature_impl =
+            this->task_registry.task_mapping.at(task_id);
+        // TODO: multi gpu
+        register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl);
+      }
+    }
   }
+
+  // TODO: multi gpu
+  realm_tensor_backing = RealmTensorBacking(
+      allocated_tensors,
+      generate_unallocated_tensors(
+          allocated_tensors, get_all_tensor_attrs(this->computation_graph),
+          this->gradient_tensor_source),
+      allocators[0]);
+  realm_args_backing = initialize_args_backing(this, runtime_arg_config);
+}

-void RealmTrainingBacking::allocate_layer_optimizer_tensors(
-    layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) {
-  ComputationGraphOpAttrs attrs =
-      get_layer_attrs(this->computation_graph, node).attrs;
-  if (attrs.has()) {
-    TaskSignature sig = get_update_signature(optimizer_attrs);
-    tensor_guid_t weight_tensor =
-        get_only(get_outgoing_tensors(this->computation_graph, node));
+RealmTrainingBacking::RealmTrainingBacking(
+    Processor master_proc, AllocatedTensors const &allocated_tensors,
+    ComputationGraph const &computation_graph,
+    RuntimeArgConfig const &runtime_arg_config,
+    OptimizerAttrs const &optimizer_attrs)
+    : computation_graph(computation_graph),
+      task_registry(construct_task_registry(
+          get_layer_attrs_mapping(this->computation_graph))) {
+  this->master_proc = master_proc;
+  proc_events.insert({master_proc, Realm::Event::NO_EVENT});
+  master_mem = Machine::MemoryQuery(Machine::get_machine())
+                   .only_kind(Memory::SYSTEM_MEM)
+                   .best_affinity_to(master_proc)
+                   .first();
+  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
+                                   .only_kind(Processor::TOC_PROC);
+  for (Processor p : pq) {
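// (Editorial note: Processor::TOC_PROC selects Realm's throughput-optimized
// cores, i.e. GPUs; each one gets an event-chain entry and its own
// RealmAllocator below, though only worker_procs[0] is used until the
// multi-GPU TODOs in this file are resolved.)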
worker_procs.push_back(p); + proc_events.insert({p, Realm::Event::NO_EVENT}); + allocators.push_back(RealmAllocator::create(p)); + } + assert(worker_procs.size() > 0); - std::vector optimizer_tensors; - for (TensorTypeSlotSpec const &tensor_type_slot_spec : - values(sig.tensor_guid_slots)) { - optimizer_tensors.push_back( - this->optimizer_tensor_source.new_optimizer_tensor()); + // register tasks for realm + for (layer_guid_t const &node : + topological_ordering(this->computation_graph)) { + ComputationGraphOpAttrs attrs = + get_layer_attrs(this->computation_graph, node).attrs; + if (attrs.has()) { + OpTaskInvocation op_task_invocation = attrs.get(); + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = + this->task_registry.task_mapping.at(task_id); + // TODO: multi gpu + register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + } } - this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); - this->realm_tensor_backing.allocate_optimizer_tensors( - weight_tensor, optimizer_tensors, this->allocators[0]); } + + // TODO: multi gpu + realm_tensor_backing = RealmTensorBacking( + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + allocators[0]); + realm_args_backing = initialize_args_backing(this, runtime_arg_config); } -void RealmTrainingBacking::execute_init(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(this->task_registry, operator_node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = - this->lower_to_task_invocation(init(attrs), operator_node); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); - task_id_t task_id = invocation.task_id; - TaskImplFunction impl_function = - this->task_registry.task_mapping.at(task_id).impl_function; - // TODO: multi gpu launching - Promise promise(master_mem); - Future future = promise.get_future(); - RealmTaskArgs args{ - task_id, impl_function, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn(static_cast(task_id), - &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; - future.set_event(e); - this->realm_args_backing.add_per_device_op_state(operator_node, - std::move(future.get())); +RealmArgsBacking +initialize_args_backing(RealmTrainingBacking *backing, + RuntimeArgConfig const &runtime_arg_config) { + // initialize_args_backing(TaskRegistry const &task_registry, + // ComputationGraph const &cg, + // RuntimeArgConfig const &runtime_arg_config, + // RealmTensorBacking const &realm_tensor_backing) { + std::unordered_map + per_device_op_states; + TaskRegistry const &task_registry = backing->task_registry; + ComputationGraph const &cg = backing->computation_graph; + RealmTensorBacking const &realm_tensor_backing = + backing->realm_tensor_backing; + Processor master_proc = backing->master_proc; + Memory master_mem = backing->master_mem; + std::vector &worker_procs = backing->worker_procs; + std::unordered_map &proc_events = backing->proc_events; + + for (layer_guid_t const &node : topological_ordering(cg)) { + if (registry_contains_task_for_layer(task_registry, node, + OpTaskType::INIT)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + + TaskInvocation invocation = 
lower_to_task_invocation( + init(attrs), node, get_incoming_inputs(cg, node), + get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node), + get_incoming_weights(cg, node), + realm_tensor_backing.tensor_gradient_mapping, std::nullopt); + TaskArgumentAccessor accessor = get_task_arg_accessor( + realm_tensor_backing, + make_args_backing_with_empty_device_states(runtime_arg_config), + invocation); + task_id_t task_id = invocation.task_id; + TaskImplFunction impl_function = + task_registry.task_mapping.at(task_id).impl_function; + // TODO: multi gpu launching + Promise promise(master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{ + task_id, impl_function, accessor, std::move(promise)}; + Event e = worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + proc_events[worker_procs[0]]); + proc_events[worker_procs[0]] = e; + future.set_event(e); + per_device_op_states.insert({node, std::move(future.get())}); + } } + + return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } +// void RealmTrainingBacking::register_and_allocate_layer( +// layer_guid_t const &node) { +// ComputationGraphOpAttrs attrs = +// get_layer_attrs(this->computation_graph, node).attrs; +// this->realm_tensor_backing.allocate_layer_tensors( +// node, this->computation_graph, this->allocators[0]); +// } + +// void RealmTrainingBacking::allocate_layer_optimizer_tensors( +// layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { +// ComputationGraphOpAttrs attrs = +// get_layer_attrs(this->computation_graph, node).attrs; +// if (attrs.has()) { +// TaskSignature sig = get_update_signature(optimizer_attrs); +// tensor_guid_t weight_tensor = +// get_only(get_outgoing_tensors(this->computation_graph, node)); + +// std::vector optimizer_tensors; +// for (TensorTypeSlotSpec const &tensor_type_slot_spec : +// values(sig.tensor_guid_slots)) { +// optimizer_tensors.push_back( +// this->optimizer_tensor_source.new_optimizer_tensor()); +// } +// this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); +// this->realm_tensor_backing.allocate_optimizer_tensors( +// weight_tensor, optimizer_tensors, this->allocators[0]); +// } +// } + Future> -RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) { - if (registry_contains_task_for_layer(this->task_registry, operator_node, - OpTaskType::FWD)) { +execute_forward(RealmTrainingBacking &realm_training_backing, + layer_guid_t const &operator_node) { + if (registry_contains_task_for_layer(realm_training_backing.task_registry, + operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, operator_node).attrs; - TaskInvocation invocation = - this->lower_to_task_invocation(forward(attrs), operator_node); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + get_layer_attrs(realm_training_backing.computation_graph, operator_node) + .attrs; + std::optional device_state = + get_per_device_op_state_if_exists( + realm_training_backing.realm_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( + forward(attrs), operator_node, + get_incoming_inputs(realm_training_backing.computation_graph, + operator_node), + get_incoming_input_shapes(realm_training_backing.computation_graph, + operator_node), + get_outgoing_tensors(realm_training_backing.computation_graph, + operator_node), + get_incoming_weights(realm_training_backing.computation_graph, + operator_node), + 
realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
+        device_state);
+    TaskArgumentAccessor accessor = get_task_arg_accessor(
+        realm_training_backing.realm_tensor_backing,
+        realm_training_backing.realm_args_backing, invocation);
     task_id_t task_id = invocation.task_id;
     TaskImplFunction impl_function =
-        this->task_registry.task_mapping.at(task_id).impl_function;
+        realm_training_backing.task_registry.task_mapping.at(task_id)
+            .impl_function;
     // TODO: multi gpu launching
-    Promise> promise(master_mem);
+    Promise> promise(realm_training_backing.master_mem);
     Future> future = promise.get_future();
     RealmTaskArgs> args{task_id, impl_function, accessor, std::move(promise)};
-    Event e = worker_procs[0].spawn(static_cast(task_id),
-                                    &args, sizeof(args),
-                                    proc_events[worker_procs[0]]);
-    proc_events[worker_procs[0]] = e;
+    Event e = realm_training_backing.worker_procs[0].spawn(
+        static_cast(task_id), &args, sizeof(args),
+        realm_training_backing
+            .proc_events[realm_training_backing.worker_procs[0]]);
+    realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] =
+        e;
     future.set_event(e);
     return future;
   } else {
@@ -133,26 +254,46 @@ RealmTrainingBacking::execute_forward(layer_guid_t const &operator_node) {
 }

 Future>
-RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(this->task_registry, operator_node,
-                                       OpTaskType::BWD)) {
+execute_backward(RealmTrainingBacking &realm_training_backing,
+                 layer_guid_t const &operator_node) {
+  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
+                                       operator_node, OpTaskType::BWD)) {
     ComputationGraphOpAttrs attrs =
-        get_layer_attrs(this->computation_graph, operator_node).attrs;
-    TaskInvocation invocation =
-        this->lower_to_task_invocation(backward(attrs), operator_node);
-    TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation);
+        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
+            .attrs;
+    std::optional device_state =
+        get_per_device_op_state_if_exists(
+            realm_training_backing.realm_args_backing, operator_node);
+    TaskInvocation invocation = lower_to_task_invocation(
+        backward(attrs), operator_node,
+        get_incoming_inputs(realm_training_backing.computation_graph,
+                            operator_node),
+        get_incoming_input_shapes(realm_training_backing.computation_graph,
+                                  operator_node),
+        get_outgoing_tensors(realm_training_backing.computation_graph,
+                             operator_node),
+        get_incoming_weights(realm_training_backing.computation_graph,
+                             operator_node),
+        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
+        device_state);
+    TaskArgumentAccessor accessor = get_task_arg_accessor(
+        realm_training_backing.realm_tensor_backing,
+        realm_training_backing.realm_args_backing, invocation);
     task_id_t task_id = invocation.task_id;
     TaskImplFunction impl_function =
-        this->task_registry.task_mapping.at(task_id).impl_function;
+        realm_training_backing.task_registry.task_mapping.at(task_id)
+            .impl_function;
     // TODO: multi gpu launching
-    Promise> promise(master_mem);
+    Promise> promise(realm_training_backing.master_mem);
     Future> future = promise.get_future();
     RealmTaskArgs> args{task_id, impl_function, accessor, std::move(promise)};
-    Event e = worker_procs[0].spawn(static_cast(task_id),
-                                    &args, sizeof(args),
-                                    proc_events[worker_procs[0]]);
-    proc_events[worker_procs[0]] = e;
+    Event e = realm_training_backing.worker_procs[0].spawn(
+        static_cast(task_id), &args, sizeof(args),
+        realm_training_backing
.proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } else { @@ -160,34 +301,50 @@ RealmTrainingBacking::execute_backward(layer_guid_t const &operator_node) { } } -Future -RealmTrainingBacking::execute_update(layer_guid_t const &node, - OptimizerAttrs const &optimizer_attrs) { - LayerAttrs layer_attrs = get_layer_attrs(this->computation_graph, node); +Future execute_update(RealmTrainingBacking const &realm_training_backing, + layer_guid_t const &node, + OptimizerAttrs const &optimizer_attrs) { + LayerAttrs layer_attrs = + get_layer_attrs(realm_training_backing.computation_graph, node); if (layer_attrs.attrs.has()) { // get tensors - tensor_guid_t weight_tensor = - get_only(get_outgoing_tensors(this->computation_graph, node)); + tensor_guid_t weight_tensor = get_only( + get_outgoing_tensors(realm_training_backing.computation_graph, node)); + + gradient_tensor_t weight_grad_tensor = + realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + weight_tensor); std::vector optimizer_buffer_tensors = - this->layer_optimizer_tensor_ids.at(node); + realm_training_backing.realm_tensor_backing.tensor_optimizer_mapping.at( + weight_tensor); + // get invocation - TaskInvocation invocation = get_update_invocation( - optimizer_attrs, weight_tensor, optimizer_buffer_tensors); + TaskInvocation invocation = + get_update_invocation(optimizer_attrs, weight_tensor, + weight_grad_tensor, optimizer_buffer_tensors); + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_update_signature(attrs), invocation)); - TaskArgumentAccessor accessor = this->get_task_arg_accessor(invocation); + + // execute update + TaskArgumentAccessor accessor = get_task_arg_accessor( + realm_training_backing.realm_tensor_backing, + realm_training_backing.realm_args_backing, invocation); task_id_t task_id = invocation.task_id; - register_wrapper_tasks_generic(worker_procs[0], task_id); + register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + task_id); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); RealmTaskArgs args{task_id, update_impl_fn, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn(static_cast(task_id), - &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = realm_training_backing.worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + realm_training_backing + .proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } else { @@ -195,98 +352,50 @@ RealmTrainingBacking::execute_update(layer_guid_t const &node, } } -Future -RealmTrainingBacking::compute_loss(LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor) { - TaskInvocation loss_invocation = - backward(loss_attrs, logit_tensor, label_tensor); +Future compute_loss(RealmTrainingBacking const &realm_training_backing, + LossAttrs const &loss_attrs, + tensor_guid_t const &logit_tensor, + loss_tensor_t const &label_tensor) { + TaskInvocation loss_invocation = backward( + loss_attrs, logit_tensor, + realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + logit_tensor), + label_tensor); // TODO: 
https://github.com/flexflow/flexflow-train/issues/1442 // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); - TaskArgumentAccessor loss_accessor = - this->get_task_arg_accessor(loss_invocation); + TaskArgumentAccessor loss_accessor = get_task_arg_accessor( + realm_training_backing.realm_tensor_backing, + realm_training_backing.realm_args_backing, loss_invocation); task_id_t task_id = loss_invocation.task_id; - register_wrapper_tasks_generic(worker_procs[0], task_id); + register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + task_id); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, std::move(promise)}; - Event e = - worker_procs[0].spawn(static_cast(task_id), &args, - sizeof(args), proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = realm_training_backing.worker_procs[0].spawn( + static_cast(task_id), &args, sizeof(args), + realm_training_backing + .proc_events[realm_training_backing.worker_procs[0]]); + realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = + e; future.set_event(e); return future; } -TaskArgumentAccessor RealmTrainingBacking::get_task_arg_accessor( - TaskInvocation const &invocation) const { +TaskArgumentAccessor +get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing, + RealmArgsBacking const &realm_args_backing, + TaskInvocation const &invocation) { TensorSlotsBacking tensor_slots_backing = - this->realm_tensor_backing.construct_tensor_slots_backing( - invocation.binding); - ArgSlotsBacking arg_slots_backing = - this->realm_args_backing.construct_arg_slots_backing(invocation.binding); + construct_tensor_slots_backing(realm_tensor_backing, invocation.binding); + ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( + invocation.binding, realm_args_backing.runtime_arg_config); + // TODO: multi gpu return TaskArgumentAccessor::create( - this->allocators[0], tensor_slots_backing, arg_slots_backing); -} - -TaskInvocation RealmTrainingBacking::lower_to_task_invocation( - OpTaskInvocation const &op_task_invocation, - layer_guid_t const &layer_guid) const { - TaskBinding binding; - // tensors - for (auto const &tensor_binding : - op_task_invocation.binding.get_tensor_bindings()) { - tensor_guid_t tensor_to_bind = [&]() -> tensor_guid_t { - switch (tensor_binding.second.role) { - case TensorRole::INPUT: - return get_incoming_inputs(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - case TensorRole::OUTPUT: - return get_outgoing_tensors(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - case TensorRole::WEIGHT: - return get_incoming_weights(this->computation_graph, layer_guid) - .at(tensor_binding.second.idx); - default: - throw mk_runtime_error( - fmt::format("Invalid tensor role {}", tensor_binding.second.role)); - } - }(); - - if (tensor_binding.first.is_grad == IsGrad::NO) { - binding.bind(tensor_binding.first.slot_id, tensor_to_bind); - } else if (tensor_binding.first.is_grad == IsGrad::YES) { - binding.bind_grad(tensor_binding.first.slot_id, tensor_to_bind); - } else { - throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}", - tensor_binding.first.is_grad)); - } - } - - // args - for (auto const &arg_binding : - op_task_invocation.binding.get_arg_bindings()) { - if (arg_binding.second.has()) { - ConcreteArgSpec concrete_arg = - 
this->realm_args_backing.lower_to_concrete_arg_spec( - arg_binding.second.get(), this->computation_graph, - layer_guid); - binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg}); - } else if (arg_binding.second.has()) { - binding.insert_arg_spec( - arg_binding.first, - TaskArgSpec{arg_binding.second.get()}); - } else { - binding.insert_arg_spec( - arg_binding.first, - TaskArgSpec{arg_binding.second.get()}); - } - } - - return TaskInvocation{op_task_invocation.task_id, binding}; + realm_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow From 89752fa904e5112f735f8957e1d7beb3bf1995f5 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Fri, 21 Feb 2025 21:28:39 -0800 Subject: [PATCH 47/91] Move local tensor backing to dtgen --- .../local_task_argument_accessor.h | 9 - .../local-execution/local_tensor_backing.h | 65 ++-- .../local_tensor_backing.struct.toml | 34 ++ .../local-execution/local_training_backing.h | 22 +- .../local-execution/model_training_instance.h | 13 +- .../local-execution/unallocated_tensors.h | 27 ++ .../src/local_cost_estimator.cc | 6 +- .../src/local_task_argument_accessor.cc | 31 -- .../src/local_tensor_backing.cc | 225 +++---------- .../src/local_training_backing.cc | 76 +++-- .../src/model_training_instance.cc | 49 ++- .../src/unallocated_tensors.cc | 93 ++++++ lib/local-execution/test/CMakeLists.txt | 1 + .../test/src/test_local_slots_backing.cc | 309 ------------------ .../test/src/test_local_tensor_backing.cc | 152 +++++++++ .../test/src/test_task_registry.cc | 69 ++++ .../test/src/test_unallocated_tensors.cc | 2 +- 17 files changed, 546 insertions(+), 637 deletions(-) create mode 100644 lib/local-execution/include/local-execution/local_tensor_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.h create mode 100644 lib/local-execution/src/unallocated_tensors.cc delete mode 100644 lib/local-execution/test/src/test_local_slots_backing.cc create mode 100644 lib/local-execution/test/src/test_local_tensor_backing.cc diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index b1e5a02985..c46534330b 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -39,15 +39,6 @@ struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { ArgSlotsBacking arg_slots_backing; }; -using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotTensorTypeId, - std::variant, - std::vector>>>; - -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); - CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index c05e39beae..70a2474159 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -6,6 +6,7 @@ #include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_task_argument_accessor.h" +#include "local-execution/local_tensor_backing.dtg.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/lowered_tensor_source.h" 
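// (Hedged sketch of the post-dtgen API, not part of this patch: with
// LocalTensorBacking now a plain generated struct, behavior lives in the
// free functions declared below, e.g.
//
//   LocalTensorBacking backing = construct_local_tensor_backing(
//       allocated, unallocated, allocator);
//   GenericTensorAccessorW acc = get_tensor(backing, tensor_type);
//
// where `allocated`, `unallocated`, `allocator`, and `tensor_type` are
// assumed to be in scope.)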
#include "local-execution/optimizer_tensor_source.h" @@ -19,51 +20,25 @@ namespace FlexFlow { -using TensorBackingMap = - std::unordered_map; - -struct LocalTensorBacking { - LocalTensorBacking(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator const &); - -public: - GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; - -public: - // tensors - TensorBackingMap tensor_backings; - - std::unordered_map tensor_lowering_mapping; - std::unordered_map - gradient_tensor_lowering_mapping; - std::unordered_map - optimizer_tensor_lowering_mapping; - std::unordered_map - loss_tensor_lowering_mapping; - - std::unordered_map tensor_gradient_mapping; - std::unordered_map> - tensor_optimizer_mapping; - - Allocator allocator; - -private: - lowered_tensor_t insert_tensor(TensorTypeVariant const &); - LoweredTensorSource lowered_tensor_source; -}; - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); +GenericTensorAccessorW get_tensor(LocalTensorBacking const &, + TensorTypeVariant const &); + +std::unordered_map + get_tensor_backings( + std::unordered_map const &, + std::unordered_map const &, + Allocator &); + +std::unordered_map> + merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated); + +LocalTensorBacking construct_local_tensor_backing(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator &); TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, TaskBinding const &); diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml new file mode 100644 index 0000000000..c34063af5d --- /dev/null +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "LocalTensorBacking" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "pcg/tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/optimizer_tensor_t.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "tensor_gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "tensor_optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index b61d20c232..8c2bb34130 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -13,12 +13,12 @@ namespace FlexFlow { struct LocalTrainingBacking { - LocalTrainingBacking(Allocator const &, + LocalTrainingBacking(Allocator &, AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &); - 
LocalTrainingBacking(Allocator const &, + LocalTrainingBacking(Allocator &, AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &, @@ -38,27 +38,33 @@ struct LocalTrainingBacking { LocalArgsBacking initialize_args_backing(TaskRegistry const &, ComputationGraph const &, RuntimeArgConfig const &, - LocalTensorBacking const &); + LocalTensorBacking const &, + Allocator &); std::optional call_task_impl(TaskRegistry const &, task_id_t const &task_id, TaskArgumentAccessor const &acc); std::optional execute_forward(LocalTrainingBacking const &, - layer_guid_t const &); + layer_guid_t const &, + Allocator &); std::optional execute_backward(LocalTrainingBacking const &, - layer_guid_t const &); + layer_guid_t const &, + Allocator &); void compute_loss(LocalTrainingBacking const &, LossAttrs const &, tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); + loss_tensor_t const &label_tensor, + Allocator &); void execute_update(LocalTrainingBacking const &, layer_guid_t const &, - OptimizerAttrs const &); + OptimizerAttrs const &, + Allocator &); TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, LocalArgsBacking const &, - TaskInvocation const &); + TaskInvocation const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 99a1bd5a9a..b36b20ed04 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -12,22 +12,25 @@ using PerLayerElapsedTime = std::unordered_map>; struct ModelTrainingInstance { - ModelTrainingInstance(LocalTrainingBacking const &, + ModelTrainingInstance(Allocator const &, + LocalTrainingBacking const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); + Allocator allocator; LocalTrainingBacking training_backing; tensor_guid_t logit_tensor; loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; -}; -PerLayerElapsedTime forward(ModelTrainingInstance const &); -PerLayerElapsedTime backward(ModelTrainingInstance const &); -void update(ModelTrainingInstance &); +public: + PerLayerElapsedTime forward(); + PerLayerElapsedTime backward(); + void update(); +}; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.h b/lib/local-execution/include/local-execution/unallocated_tensors.h new file mode 100644 index 0000000000..63ead67589 --- /dev/null +++ b/lib/local-execution/include/local-execution/unallocated_tensors.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H +#define _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H + +#include "local-execution/allocated_tensors.dtg.h" +#include "local-execution/gradient_tensor_source.h" +#include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.dtg.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_attrs.dtg.h" + +namespace FlexFlow { + +UnallocatedTensors generate_unallocated_tensors( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &); + +UnallocatedTensors generate_unallocated_tensors_with_optimizer( + AllocatedTensors const &, + std::unordered_map const &, + GradientTensorSource &, + OptimizerTensorSource &, + OptimizerAttrs const &); + +} // namespace FlexFlow + +#endif diff --git 
a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 41a5df8d48..7d05bb1e81 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,8 +95,10 @@ CostDetails LocalCostEstimator::estimate_cost( // execute layer layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - float fwd = execute_forward(local_backing, operator_layer_guid).value(); - float bwd = execute_backward(local_backing, operator_layer_guid).value(); + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index fb6db2ed98..e53e3abeff 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -57,37 +57,6 @@ Allocator LocalTaskArgumentAccessor::get_allocator() const { return this->allocator; } -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &slots_backing) { - - TensorSlotsBackingWithoutAddresses addressless_slots_backing; - - using TensorAccessorVariant = - std::variant>; - for (auto const &slot_tensor : slots_backing) { - TensorAccessorVariant accessor_variant = slot_tensor.second; - std::visit( - overload{ - [&](GenericTensorAccessorW const &accessor) { - addressless_slots_backing.insert( - {slot_tensor.first, get_shape_and_datatype(accessor)}); - }, - [&](std::vector const &variadic_accessor) { - std::vector> - variadic_addressless_accessor = - transform(variadic_accessor, - [](GenericTensorAccessorW const &accessor) { - return get_shape_and_datatype(accessor); - }); - addressless_slots_backing.insert( - {slot_tensor.first, variadic_addressless_accessor}); - }}, - accessor_variant); - } - return addressless_slots_backing; -} - size_t LocalTaskArgumentAccessor::get_device_idx() const { return 0; } diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc index b5a0deaee4..629117508f 100644 --- a/lib/local-execution/src/local_tensor_backing.cc +++ b/lib/local-execution/src/local_tensor_backing.cc @@ -1,200 +1,81 @@ #include "local-execution/local_tensor_backing.h" -#include "task-spec/slot_grad_id.dtg.h" - -#include "local-execution/allocated_tensors.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" +#include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" #include "utils/containers/keys.h" #include "utils/overload.h" namespace FlexFlow { -LocalTensorBacking::LocalTensorBacking( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator const &allocator) - : tensor_gradient_mapping(allocated_tensors.gradient_mapping), - tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), - allocator(allocator) { - - // handle already-allocated tensors - for (std::pair const - &tensor_type_backing : allocated_tensors.tensor_type_backings) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_backing.first); - this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); - } - - // allocate new tensors - this->tensor_gradient_mapping.insert( - 
unallocated_tensors.gradient_mapping.begin(),
-      unallocated_tensors.gradient_mapping.end());
+GenericTensorAccessorW
+    get_tensor(LocalTensorBacking const &local_tensor_backing,
+               TensorTypeVariant const &tensor_type) {
+  return local_tensor_backing.tensor_backings.at(tensor_type);
+}

+std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+    merge_optimizer_mappings(
+        std::unordered_map<tensor_guid_t,
+                           std::vector<optimizer_tensor_t>> const &allocated,
+        std::unordered_map<tensor_guid_t,
+                           std::vector<optimizer_tensor_t>> const
+            &unallocated) {
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      merged_maps = allocated;
   for (std::pair<tensor_guid_t, std::vector<optimizer_tensor_t>> const
-           &unallocated_optimizer_tensors :
-               unallocated_tensors.optimizer_mapping) {
-    if (this->tensor_optimizer_mapping.count(
-            unallocated_optimizer_tensors.first)) {
+           &unallocated_optimizer_tensors : unallocated) {
+    if (merged_maps.count(unallocated_optimizer_tensors.first)) {
       for (optimizer_tensor_t const &optimizer_tensor :
            unallocated_optimizer_tensors.second) {
-        this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first]
-            .push_back(optimizer_tensor);
+        merged_maps[unallocated_optimizer_tensors.first].push_back(
+            optimizer_tensor);
       }
     } else {
-      this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors});
+      merged_maps.insert({unallocated_optimizer_tensors});
     }
   }
-
-  for (std::pair<TensorTypeVariant, TensorShape> const &tensor_type_shape :
-       unallocated_tensors.tensor_type_shapes) {
-    lowered_tensor_t lowered_tensor =
-        this->insert_tensor(tensor_type_shape.first);
-    GenericTensorAccessorW tensor_backing =
-        this->allocator.allocate_tensor(tensor_type_shape.second);
-    this->tensor_backings.insert({lowered_tensor, tensor_backing});
-  }
-};
-
-lowered_tensor_t
-    LocalTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) {
-  lowered_tensor_t lowered_tensor =
-      this->lowered_tensor_source.new_lowered_tensor();
-  tensor_type.visit(overload{
-      [&](tensor_guid_t const &tensor_guid) {
-        this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](gradient_tensor_t const &gradient_tensor) {
-        this->gradient_tensor_lowering_mapping.insert(
-            {gradient_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](optimizer_tensor_t const &optimizer_tensor) {
-        this->optimizer_tensor_lowering_mapping.insert(
-            {optimizer_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](loss_tensor_t const &loss_tensor) {
-        this->loss_tensor_lowering_mapping.insert(
-            {loss_tensor, lowered_tensor});
-        return std::nullopt;
-      },
-      [&](auto const &any_tensor) {
-        throw mk_runtime_error(
-            fmt::format("Unhandled tensor type {}", any_tensor));
-      }});
-  return lowered_tensor;
+  return merged_maps;
 }

-GenericTensorAccessorW
-    LocalTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const {
-  lowered_tensor_t lowered_tensor =
-      tensor_type.visit(overload{
-          [&](tensor_guid_t const &tensor_guid) {
-            return this->tensor_lowering_mapping.at(tensor_guid);
-          },
-          [&](gradient_tensor_t const &gradient_tensor) {
-            return this->gradient_tensor_lowering_mapping.at(gradient_tensor);
-          },
-          [&](optimizer_tensor_t const &optimizer_tensor) {
-            return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor);
-          },
-          [&](loss_tensor_t const &loss_tensor) {
-            return this->loss_tensor_lowering_mapping.at(loss_tensor);
-          },
-          [&](auto const &any_tensor) {
-            throw mk_runtime_error(
-                fmt::format("Unhandled tensor type {}", any_tensor));
-          }});
-  return this->tensor_backings.at(lowered_tensor);
-}
-
-UnallocatedTensors generate_unallocated_tensors(
-    AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
-    GradientTensorSource &gradient_tensor_source) {
-
-  assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping));
-
-  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
-  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
+std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+    get_tensor_backings(
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const
+            &tensor_type_backings,
+        std::unordered_map<TensorTypeVariant, TensorShape> const
+            &tensor_type_shapes,
+        Allocator &allocator) {
+  std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+      all_tensor_backings = tensor_type_backings;

-  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
-       tensor_attrs_mapping) {
-    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
-    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
-    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
-    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
-      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
-    }
-
-    if (tensor_attrs.create_gradients == CreateGrad::YES &&
-        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
-      gradient_tensor_t gradient_tensor =
-          gradient_tensor_source.new_gradient_tensor();
-      tensor_type_shapes.insert(
-          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
-      gradient_mapping.insert({tensor_guid, gradient_tensor});
-    }
+  // allocate new tensors
+  for (std::pair<TensorTypeVariant, TensorShape> const &tensor_type_shape :
+       tensor_type_shapes) {
+    GenericTensorAccessorW tensor_backing =
+        allocator.allocate_tensor(tensor_type_shape.second);
+    all_tensor_backings.insert({tensor_type_shape.first, tensor_backing});
   }

-  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
+  return all_tensor_backings;
 }

-UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+LocalTensorBacking construct_local_tensor_backing(
     AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
-    GradientTensorSource &gradient_tensor_source,
-    OptimizerTensorSource &optimizer_tensor_source,
-    OptimizerAttrs const &optimizer_attrs) {
-
-  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
-      allocated_tensors, tensor_attrs_mapping, gradient_tensor_source);
-
-  if (!get_num_optimizer_tensors(optimizer_attrs)) {
-    return unallocated_tensors;
-  }
-
-  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
-      unallocated_tensors.tensor_type_shapes;
-  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
-      unallocated_tensors.gradient_mapping;
-  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
-      optimizer_mapping;
-
-  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
-       tensor_attrs_mapping) {
-    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
-    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
-    if (tensor_attrs.create_gradients == CreateGrad::YES) {
-      std::vector<optimizer_tensor_t> optimizer_tensors;
-
-      int num_optimizer_tensors_to_allocate =
-          get_num_optimizer_tensors(optimizer_attrs);
-      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
-        num_optimizer_tensors_to_allocate -=
-            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
-      }
-      std::cout << num_optimizer_tensors_to_allocate;
-
-      for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) {
-        optimizer_tensor_t optimizer_tensor =
-            optimizer_tensor_source.new_optimizer_tensor();
-        optimizer_tensors.push_back(optimizer_tensor);
-        tensor_type_shapes.insert(
-            {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape});
-      }
-
-      if (num_optimizer_tensors_to_allocate > 0) {
-        optimizer_mapping.insert({tensor_guid, optimizer_tensors});
-      }
-    }
-  }
-
-  return UnallocatedTensors{
-      tensor_type_shapes, gradient_mapping, optimizer_mapping};
+    UnallocatedTensors const &unallocated_tensors,
+    Allocator &allocator) {
+
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> merged_gradient_maps =
+      allocated_tensors.gradient_mapping;
+  merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(),
+                              unallocated_tensors.gradient_mapping.end());
+
+  return LocalTensorBacking{
+      get_tensor_backings(allocated_tensors.tensor_type_backings,
+                          unallocated_tensors.tensor_type_shapes,
+                          allocator),
+      merged_gradient_maps,
+      merge_optimizer_mappings(allocated_tensors.optimizer_mapping,
+                               unallocated_tensors.optimizer_mapping)};
 }

 TensorSlotsBacking construct_tensor_slots_backing(
@@ -205,7 +86,7 @@ TensorSlotsBacking construct_tensor_slots_backing(
   for (std::pair<SlotTensorTypeId, TensorTypeVariant> const &tensor_binding :
        binding.get_tensor_bindings()) {
     mapping.insert({tensor_binding.first,
-                    local_tensor_backing.get_tensor(tensor_binding.second)});
+                    get_tensor(local_tensor_backing, tensor_binding.second)});
   }

   return mapping;
diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc
index f09234b920..576ab53859 100644
--- a/lib/local-execution/src/local_training_backing.cc
+++ b/lib/local-execution/src/local_training_backing.cc
@@ -2,6 +2,7 @@
 #include "local-execution/loss_functions.h"
 #include "local-execution/optimizer.h"
 #include "local-execution/task_signature_impl.h"
+#include "local-execution/unallocated_tensors.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
 #include "task-spec/op_task_to_task_invocation.h"
@@ -15,26 +16,28 @@ namespace FlexFlow {

 LocalTrainingBacking::LocalTrainingBacking(
-    Allocator const &allocator,
+    Allocator &allocator,
     AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config)
     : computation_graph(computation_graph),
       task_registry(construct_task_registry(
           get_layer_attrs_mapping(this->computation_graph))),
-      local_tensor_backing(allocated_tensors,
-                           generate_unallocated_tensors(
-                               allocated_tensors,
-                               get_all_tensor_attrs(this->computation_graph),
-                               this->gradient_tensor_source),
-                           allocator),
+      local_tensor_backing(construct_local_tensor_backing(
+          allocated_tensors,
+          generate_unallocated_tensors(
+              allocated_tensors,
+              get_all_tensor_attrs(this->computation_graph),
+              this->gradient_tensor_source),
+          allocator)),
       local_args_backing(initialize_args_backing(this->task_registry,
                                                  this->computation_graph,
                                                  runtime_arg_config,
-                                                 this->local_tensor_backing)){};
+                                                 this->local_tensor_backing,
+                                                 allocator)){};

 LocalTrainingBacking::LocalTrainingBacking(
-    Allocator const &allocator,
+    Allocator &allocator,
     AllocatedTensors const &allocated_tensors,
     ComputationGraph const &computation_graph,
     RuntimeArgConfig const &runtime_arg_config,
@@ -42,24 +45,27 @@ LocalTrainingBacking::LocalTrainingBacking(
     : computation_graph(computation_graph),
       task_registry(construct_task_registry(
           get_layer_attrs_mapping(this->computation_graph))),
-      local_tensor_backing(allocated_tensors,
-                           generate_unallocated_tensors_with_optimizer(
-                               allocated_tensors,
-                               get_all_tensor_attrs(this->computation_graph),
-                               this->gradient_tensor_source,
-                               this->optimizer_tensor_source,
-                               optimizer_attrs),
-                           allocator),
+      local_tensor_backing(construct_local_tensor_backing(
+          allocated_tensors,
+          generate_unallocated_tensors_with_optimizer(
+              allocated_tensors,
+              get_all_tensor_attrs(this->computation_graph),
+              this->gradient_tensor_source,
+              this->optimizer_tensor_source,
+              optimizer_attrs),
+          allocator)),
       local_args_backing(initialize_args_backing(this->task_registry,
                                                  this->computation_graph,
                                                  runtime_arg_config,
-                                                 this->local_tensor_backing)){};
+                                                 this->local_tensor_backing,
+                                                 allocator)){};

 LocalArgsBacking
     initialize_args_backing(TaskRegistry const &task_registry,
                             ComputationGraph const &cg,
                             RuntimeArgConfig const &runtime_arg_config,
-                            LocalTensorBacking const &local_tensor_backing) {
+                            LocalTensorBacking const &local_tensor_backing,
+                            Allocator &allocator) {
   std::unordered_map<layer_guid_t, std::optional<DeviceSpecificDeviceStates>>
       per_device_op_states;

   for (layer_guid_t const &node : topological_ordering(cg)) {
@@ -79,7 +85,8 @@
       TaskArgumentAccessor accessor = get_task_arg_accessor(
           local_tensor_backing,
           make_args_backing_with_empty_device_states(runtime_arg_config),
-          invocation);
+          invocation,
+          allocator);
       TaskSignatureAndImpl task_sig_impl =
           task_registry.task_mapping.at(invocation.task_id);
       auto fn = task_sig_impl.impl_function.get<InitTaskImplFunction>()
@@ -103,7 +110,8 @@ std::optional<float> call_task_impl(TaskRegistry const &task_registry,

 std::optional<float>
     execute_forward(LocalTrainingBacking const &local_training_backing,
-                    layer_guid_t const &operator_node) {
+                    layer_guid_t const &operator_node,
+                    Allocator &allocator) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::FWD)) {
@@ -130,7 +138,8 @@ std::optional<float>
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -141,7 +150,8 @@ std::optional<float>
 void compute_loss(LocalTrainingBacking const &local_training_backing,
                   LossAttrs const &loss_attrs,
                   tensor_guid_t const &logit_tensor,
-                  loss_tensor_t const &label_tensor) {
+                  loss_tensor_t const &label_tensor,
+                  Allocator &allocator) {
   TaskInvocation loss_invocation = backward(
       loss_attrs,
       logit_tensor,
@@ -153,14 +163,16 @@ void compute_loss(LocalTrainingBacking const &local_training_backing,
   TaskArgumentAccessor loss_accessor =
       get_task_arg_accessor(local_training_backing.local_tensor_backing,
                             local_training_backing.local_args_backing,
-                            loss_invocation);
+                            loss_invocation,
+                            allocator);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
   loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
 }

 std::optional<float>
     execute_backward(LocalTrainingBacking const &local_training_backing,
-                     layer_guid_t const &operator_node) {
+                     layer_guid_t const &operator_node,
+                     Allocator &allocator) {
   if (registry_contains_task_for_layer(local_training_backing.task_registry,
                                        operator_node,
                                        OpTaskType::BWD)) {
@@ -187,7 +199,8 @@ std::optional<float>
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     return call_task_impl(
         local_training_backing.task_registry, invocation.task_id, accessor);
   } else {
@@ -197,7 +210,8 @@ std::optional<float>
 void execute_update(LocalTrainingBacking const &local_training_backing,
                     layer_guid_t const &node,
-                    OptimizerAttrs const &optimizer_attrs) {
+                    OptimizerAttrs const &optimizer_attrs,
+                    Allocator &allocator) {
   LayerAttrs layer_attrs =
       get_layer_attrs(local_training_backing.computation_graph, node);
   if (layer_attrs.attrs.has<WeightAttrs>()) {
@@ -225,7 +239,8 @@ void execute_update(LocalTrainingBacking const &local_training_backing,
     TaskArgumentAccessor accessor =
         get_task_arg_accessor(local_training_backing.local_tensor_backing,
                               local_training_backing.local_args_backing,
-                              invocation);
+                              invocation,
+                              allocator);
     TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
     update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
   }
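[Note: a usage sketch added for illustration; it is not part of the patch. After this change the training loop drives LocalTrainingBacking through free functions and threads the Allocator through every call instead of reading it out of LocalTensorBacking. Assuming a computation graph `cg` and a RuntimeArgConfig `cfg` built as in the tests later in this series, a forward/backward pass now reads roughly as:]

    // Sketch under assumed setup (cg, cfg are placeholders built elsewhere);
    // AllocatedTensors{{}, {}, {}} means the caller pre-allocated nothing.
    Allocator allocator = create_local_cuda_memory_allocator();
    LocalTrainingBacking backing{
        allocator, AllocatedTensors{{}, {}, {}}, cg, cfg};
    for (layer_guid_t const &node : topological_ordering(cg)) {
      // returns std::optional<float>: elapsed time when profiling is enabled
      execute_forward(backing, node, allocator);
    }
    for (layer_guid_t const &node : reversed(topological_ordering(cg))) {
      execute_backward(backing, node, allocator);
    }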
@@ -234,13 +249,14 @@ void execute_update(LocalTrainingBacking const &local_training_backing, TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation) { + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(local_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, local_args_backing.runtime_arg_config); return TaskArgumentAccessor::create( - local_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 4a22937174..d404221d88 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -6,53 +6,52 @@ namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, LocalTrainingBacking const &local_training_backing, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : training_backing(local_training_backing), loss_attrs(loss_attrs), - optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor){}; + : allocator(allocator), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), + logit_tensor(logit_tensor), label_tensor(label_tensor){}; -PerLayerElapsedTime - forward(ModelTrainingInstance const &model_training_instance) { +PerLayerElapsedTime ModelTrainingInstance::forward() { PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + for (layer_guid_t const &node : + topological_ordering(this->training_backing.computation_graph)) { std::optional elapsed_time = - execute_forward(model_training_instance.training_backing, node); + execute_forward(this->training_backing, node, this->allocator); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime - backward(ModelTrainingInstance const &model_training_instance) { - compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, - model_training_instance.logit_tensor, - model_training_instance.label_tensor); +PerLayerElapsedTime ModelTrainingInstance::backward() { + compute_loss(this->training_backing, + this->loss_attrs, + this->logit_tensor, + this->label_tensor, + this->allocator); PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + for (layer_guid_t const &node : reversed( + topological_ordering(this->training_backing.computation_graph))) { std::optional elapsed_time = - execute_backward(model_training_instance.training_backing, node); + execute_backward(this->training_backing, node, this->allocator); per_layer_elapsed_time.insert({node, elapsed_time}); } return per_layer_elapsed_time; } -void update(ModelTrainingInstance &model_training_instance) { - for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { - 
execute_update(model_training_instance.training_backing,
-                   node,
-                   model_training_instance.optimizer_attrs);
+void ModelTrainingInstance::update() {
+  for (layer_guid_t const &node :
+       topological_ordering(this->training_backing.computation_graph)) {
+    execute_update(
+        this->training_backing, node, this->optimizer_attrs, this->allocator);
   }
-  model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter(
-      model_training_instance.optimizer_attrs);
+  this->optimizer_attrs =
+      get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
 }

 } // namespace FlexFlow
diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc
new file mode 100644
index 0000000000..ea64a46051
--- /dev/null
+++ b/lib/local-execution/src/unallocated_tensors.cc
@@ -0,0 +1,92 @@
+#include "local-execution/unallocated_tensors.h"
+#include "local-execution/allocated_tensors.h"
+#include "pcg/optimizer_attrs.h"
+
+namespace FlexFlow {
+
+UnallocatedTensors generate_unallocated_tensors(
+    AllocatedTensors const &allocated_tensors,
+    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
+    GradientTensorSource &gradient_tensor_source) {
+
+  assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping));
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping;
+
+  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
+       tensor_attrs_mapping) {
+    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
+    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
+    TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid};
+    if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) {
+      tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape});
+    }
+
+    if (tensor_attrs.create_gradients == CreateGrad::YES &&
+        !allocated_tensors.gradient_mapping.count(tensor_guid)) {
+      gradient_tensor_t gradient_tensor =
+          gradient_tensor_source.new_gradient_tensor();
+      tensor_type_shapes.insert(
+          {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape});
+      gradient_mapping.insert({tensor_guid, gradient_tensor});
+    }
+  }
+
+  return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}};
+}
+
+UnallocatedTensors generate_unallocated_tensors_with_optimizer(
+    AllocatedTensors const &allocated_tensors,
+    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs_mapping,
+    GradientTensorSource &gradient_tensor_source,
+    OptimizerTensorSource &optimizer_tensor_source,
+    OptimizerAttrs const &optimizer_attrs) {
+
+  UnallocatedTensors unallocated_tensors = generate_unallocated_tensors(
+      allocated_tensors, tensor_attrs_mapping, gradient_tensor_source);
+
+  if (!get_num_optimizer_tensors(optimizer_attrs)) {
+    return unallocated_tensors;
+  }
+
+  std::unordered_map<TensorTypeVariant, TensorShape> tensor_type_shapes =
+      unallocated_tensors.tensor_type_shapes;
+  std::unordered_map<tensor_guid_t, gradient_tensor_t> gradient_mapping =
+      unallocated_tensors.gradient_mapping;
+  std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+      optimizer_mapping;
+
+  for (std::pair<tensor_guid_t, TensorAttrs> const &tensor_guid_attrs :
+       tensor_attrs_mapping) {
+    tensor_guid_t tensor_guid = tensor_guid_attrs.first;
+    TensorAttrs tensor_attrs = tensor_guid_attrs.second;
+    if (tensor_attrs.create_gradients == CreateGrad::YES) {
+      std::vector<optimizer_tensor_t> optimizer_tensors;
+
+      int num_optimizer_tensors_to_allocate =
+          get_num_optimizer_tensors(optimizer_attrs);
+      if (allocated_tensors.optimizer_mapping.count(tensor_guid)) {
+        num_optimizer_tensors_to_allocate -=
+            allocated_tensors.optimizer_mapping.at(tensor_guid).size();
+      }
+
+      for (int i = 0; i <
num_optimizer_tensors_to_allocate; ++i) { + optimizer_tensor_t optimizer_tensor = + optimizer_tensor_source.new_optimizer_tensor(); + optimizer_tensors.push_back(optimizer_tensor); + tensor_type_shapes.insert( + {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); + } + + if (num_optimizer_tensors_to_allocate > 0) { + optimizer_mapping.insert({tensor_guid, optimizer_tensors}); + } + } + } + + return UnallocatedTensors{ + tensor_type_shapes, gradient_mapping, optimizer_mapping}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index fc647cff9b..4bcb37ea48 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -7,6 +7,7 @@ ff_add_test_executable( src/test_task_registry.cc src/test_utils.cc src/test_local_task_arg_accessor.cc + src/test_local_tensor_backing.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc deleted file mode 100644 index e5ca58bc1f..0000000000 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include "kernels/attention_kernels.h" -#include "local-execution/local_cost_estimator.h" -#include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" - -#include "op-attrs/ops/attention.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include "utils/containers/get_only.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalTensorBacking -- Attention Op") { - // allocate input memory - Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; - - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; - - DataType dtype = DataType::FLOAT; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, - DataType::FLOAT, - }; - TensorShape query_shape = input_tensor_shape; - TensorShape key_shape = input_tensor_shape; - TensorShape value_shape = input_tensor_shape; - GenericTensorAccessorW query = allocator.allocate_tensor(query_shape); - GenericTensorAccessorW key = allocator.allocate_tensor(key_shape); - GenericTensorAccessorW value = allocator.allocate_tensor(value_shape); - - // build graph - ComputationGraphBuilder cg_builder; - tensor_guid_t query_guid = - cg_builder.create_input(query_shape, CreateGrad::YES); - tensor_guid_t key_guid = - cg_builder.create_input(key_shape, CreateGrad::YES); - tensor_guid_t value_guid = - cg_builder.create_input(value_shape, CreateGrad::YES); - - std::string layer_name = "attn1"; - tensor_guid_t output_guid = - cg_builder.multihead_attention(query_guid, - key_guid, - value_guid, - embed_dim, - num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0f, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - /*initializer=*/std::nullopt, - /*maybe_name=*/layer_name); - - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - LayerTensorBackingMap layer_tensor_backing_map = { - 
{LayerTensorKey{layer_guid, lower(query_guid)}, query}, - {LayerTensorKey{layer_guid, lower(key_guid)}, key}, - {LayerTensorKey{layer_guid, lower(value_guid)}, value}, - //{LayerTensorKey{layer_guid, lower(output_guid), output}} - }; - - // runtime arg config - ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/0}; - PerDeviceFFHandle handle = get_mock_per_device_ff_handle(); - RuntimeArgConfig runtime_arg_config = - RuntimeArgConfig{DeviceSpecific::create(handle), - EnableProfiling::NO, - settings}; - - LocalTensorBacking local_tensor_backing = { - layer_tensor_backing_map, TensorBackingMap{}, runtime_arg_config}; - - SUBCASE("LocalTensorBacking::allocate_tensors_by_role") { - auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - layer_guid_t l, - LayerTensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(LayerTensorKey{l, lower(t)}); - return get_shape_and_datatype(accessor); - }; - - SUBCASE("Input (QKV) and gradient tensors allocation") { - - // allocate all tensors from input nodes - local_tensor_backing.allocate_tensors_by_role( - TensorRole::INPUT, - layer_guid, - cg_builder.computation_graph, - allocator); - - SUBCASE("Query grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{query_shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Key grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{key_shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Value grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = {ArrayShape{value_shape}, - dtype}; - CHECK(result == correct); - } - } - SUBCASE("Output and gradient tensors allocation") { - local_tensor_backing.allocate_tensors_by_role( - TensorRole::OUTPUT, - layer_guid, - cg_builder.computation_graph, - allocator); - SUBCASE("Output") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, layer_guid, local_tensor_backing.tensor_mapping); - std::pair correct = { - ArrayShape{ - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, - dtype}; - CHECK(result == correct); - } - SUBCASE("Output grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, - layer_guid, - local_tensor_backing.gradient_tensor_mapping); - std::pair correct = { - ArrayShape{ - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape}, - dtype}; - CHECK(result == correct); - } - } - - SUBCASE("Tensor slots") { - local_tensor_backing.allocate_layer_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Input tensor slots") { - std::vector correct_incoming_input_tensors = - transform( - get_incoming_inputs(cg_builder.computation_graph, layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); - CHECK(correct_incoming_input_tensors == - local_tensor_backing.input_tensor_slots.at(layer_guid)); - } - SUBCASE("Weight tensor slots") { - std::vector correct_incoming_weight_tensors = - transform(get_incoming_weights(cg_builder.computation_graph, - layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return 
lower(tensor_guid); - }); - CHECK(correct_incoming_weight_tensors == - local_tensor_backing.weight_tensor_slots.at(layer_guid)); - } - SUBCASE("Output tensor slots") { - std::vector correct_output_tensors = transform( - get_outgoing_tensors(cg_builder.computation_graph, layer_guid), - [&](tensor_guid_t const &tensor_guid) { - return lower(tensor_guid); - }); - CHECK(correct_output_tensors == - local_tensor_backing.output_tensor_slots.at(layer_guid)); - } - } - } - - SUBCASE("Construct Slots Backings") { - enum Slots { - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - QUERY_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - ATTRS, - PROFILING, - HANDLE, - }; - MultiHeadAttentionAttrs attrs = - get_layer_attrs(cg_builder.computation_graph, layer_guid) - .attrs.get(); - OpTaskBinding binding = [&] { - OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_grad(QUERY, input_tensor(0)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(HANDLE, ff_handle()); - return b; - }(); - - local_tensor_backing.allocate_layer_tensors( - layer_guid, cg_builder.computation_graph, allocator); - - SUBCASE("LocalTensorBacking::construct_tensor_slots_backing") { - TensorSlotsBackingWithoutAddresses result = - get_slots_backing_without_tensor_allocation_addresses( - local_tensor_backing.construct_tensor_slots_backing( - binding, layer_guid)); - TensorSlotsBackingWithoutAddresses correct = [&] { - TensorShape weights_shape = throw_if_unexpected( - get_weights_shape(attrs, query_shape, key_shape, value_shape)); - GenericTensorAccessorW weights = - allocator.allocate_tensor(weights_shape); - - TensorAttrs output_attrs = - get_tensor_attrs(cg_builder.computation_graph, output_guid); - GenericTensorAccessorW output = - allocator.allocate_tensor(output_attrs.shape); - return get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking{ - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::FORWARD}, - query}, - {SlotTensorTypeId{slot_id_t{KEY}, TensorType::FORWARD}, key}, - {SlotTensorTypeId{slot_id_t{VALUE}, TensorType::FORWARD}, - value}, - {SlotTensorTypeId{slot_id_t{WEIGHTS}, TensorType::FORWARD}, - weights}, - {SlotTensorTypeId{slot_id_t{OUTPUT}, TensorType::FORWARD}, - output}, - {SlotTensorTypeId{slot_id_t{QUERY}, TensorType::GRADIENT}, - query}}); - }(); - - CHECK(result == correct); - } - SUBCASE("LocalTensorBacking::construct_arg_slots_backing") { - ArgSlotsBacking result = - local_tensor_backing.construct_arg_slots_backing(binding, - layer_guid); - - ArgSlotsBacking correct = [&] { - ParallelTensorShape query_parallel_tensor_shape = - lift_to_parallel(query_shape); - - return ArgSlotsBacking{ - {slot_id_t{QPROJSIZE}, - ConcreteArgSpec::create(get_qProjSize(attrs))}, - {slot_id_t{ATTRS}, ConcreteArgSpec::create(attrs)}, - {slot_id_t{QUERY_PARALLEL_TENSOR_SHAPE}, - ConcreteArgSpec::create(query_parallel_tensor_shape)}, - {slot_id_t{PROFILING}, - ConcreteArgSpec::create(runtime_arg_config.profiling_settings)}, - {slot_id_t{HANDLE}, ConcreteArgSpec::create(handle)}}; - }(); - - CHECK(result == correct); - } - - SUBCASE("LocalTensorBacking::resolve_runtime_arg_ref_spec") { - RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); - ConcreteArgSpec arg_spec = - 
local_tensor_backing.resolve_runtime_arg_ref_spec(ref_spec);
-
-      PerDeviceFFHandle result_handle = arg_spec.get<PerDeviceFFHandle>();
-      CHECK(result_handle == handle);
-    }
-  }
-  }
-}
diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc
new file mode 100644
index 0000000000..083b677e18
--- /dev/null
+++ b/lib/local-execution/test/src/test_local_tensor_backing.cc
@@ -0,0 +1,152 @@
+#include "local-execution/local_cpu_allocator.h"
+#include "local-execution/local_tensor_backing.h"
+#include "test_utils.h"
+#include "utils/containers/keys.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+bool is_shape_and_dtype_equal_for_tensor_backings(
+    std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const &m1,
+    std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> const &m2) {
+  if (keys(m1) == keys(m2)) {
+    for (std::pair<TensorTypeVariant, GenericTensorAccessorW> const
+             &tensor_type_backing : m1) {
+      if (is_shape_and_dtype_equal(tensor_type_backing.second,
+                                   m2.at(tensor_type_backing.first))) {
+        continue;
+      } else {
+        return false;
+      }
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("LocalTensorBacking") {
+    MockTensorGuidSource tensor_guid_source;
+    GradientTensorSource gradient_tensor_source;
+    OptimizerTensorSource optimizer_tensor_source;
+    LossTensorSource loss_tensor_source;
+
+    SUBCASE("merge_optimizer_mappings") {
+      SUBCASE("Both empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings({}, {});
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            correct = {};
+        CHECK(result == correct);
+      }
+
+      tensor_guid_t allocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+      optimizer_tensor_t optimizer_tensor_1 =
+          optimizer_tensor_source.new_optimizer_tensor();
+      optimizer_tensor_t optimizer_tensor_2 =
+          optimizer_tensor_source.new_optimizer_tensor();
+      std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+          correct = {{allocated_tensor_guid,
+                      {optimizer_tensor_1, optimizer_tensor_2}}};
+      SUBCASE("Unallocated is empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            allocated = {{allocated_tensor_guid,
+                          {optimizer_tensor_1, optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings(allocated, {});
+        CHECK(result == correct);
+      }
+      SUBCASE("Allocated is empty") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            unallocated = {{allocated_tensor_guid,
+                            {optimizer_tensor_1, optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings({}, unallocated);
+        CHECK(result == correct);
+      }
+
+      SUBCASE("Both are partially allocated") {
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            allocated = {{allocated_tensor_guid, {optimizer_tensor_1}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            unallocated = {{allocated_tensor_guid, {optimizer_tensor_2}}};
+        std::unordered_map<tensor_guid_t, std::vector<optimizer_tensor_t>>
+            result = merge_optimizer_mappings(allocated, unallocated);
+        CHECK(result == correct);
+      }
+    }
+
+    SUBCASE("get_tensor_backings") {
+      Allocator allocator = create_local_cpu_memory_allocator();
+      SUBCASE("Both are empty") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings({}, {}, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {};
+        CHECK(result == correct);
+      }
+
+      tensor_guid_t allocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+      tensor_guid_t unallocated_tensor_guid =
+          tensor_guid_source.new_mock_tensor_guid();
+
+      TensorAttrs allocated_tensor_attrs = TensorAttrs{
+          TensorShape{TensorDims{FFOrdered<nonnegative_int>{16_n, 10_n}},
+                      DataType::FLOAT},
+          std::nullopt,
+          std::nullopt,
+          CreateGrad::NO};
+      TensorAttrs unallocated_tensor_attrs = TensorAttrs{
+          TensorShape{TensorDims{FFOrdered<nonnegative_int>{16_n, 20_n}},
+                      DataType::FLOAT},
+          std::nullopt,
+          std::nullopt,
+          CreateGrad::YES};
+
+      GenericTensorAccessorW allocated_tensor_backing =
+          allocator.allocate_tensor(allocated_tensor_attrs.shape);
+      GenericTensorAccessorW unallocated_tensor_backing =
+          allocator.allocate_tensor(unallocated_tensor_attrs.shape);
+
+      SUBCASE("Unallocated is empty") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+            allocated = {{TensorTypeVariant{allocated_tensor_guid},
+                          allocated_tensor_backing}};
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings(allocated, {}, allocator);
+        CHECK(result == allocated);
+      }
+      SUBCASE("Allocated is empty") {
+        std::unordered_map<TensorTypeVariant, TensorShape> unallocated = {
+            {TensorTypeVariant{unallocated_tensor_guid},
+             unallocated_tensor_attrs.shape}};
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings({}, unallocated, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {{TensorTypeVariant{unallocated_tensor_guid},
+              unallocated_tensor_backing}};
+        CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct));
+      }
+      SUBCASE("Both are partially allocated") {
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW>
+            allocated = {{TensorTypeVariant{allocated_tensor_guid},
+                          allocated_tensor_backing}};
+        std::unordered_map<TensorTypeVariant, TensorShape> unallocated = {
+            {TensorTypeVariant{unallocated_tensor_guid},
+             unallocated_tensor_attrs.shape}};
+
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> result =
+            get_tensor_backings(allocated, unallocated, allocator);
+        std::unordered_map<TensorTypeVariant, GenericTensorAccessorW> correct =
+            {{TensorTypeVariant{allocated_tensor_guid},
+              allocated_tensor_backing},
+             {TensorTypeVariant{unallocated_tensor_guid},
+              unallocated_tensor_backing}};
+        CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct));
+      }
+    }
+  }
+}
diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc
index 20b4f11a2a..dd4b6f5b44 100644
--- a/lib/local-execution/test/src/test_task_registry.cc
+++ b/lib/local-execution/test/src/test_task_registry.cc
@@ -143,5 +143,74 @@ TEST_SUITE(FF_TEST_SUITE) {
       CHECK(task_registry != other_task_registry);
     }
   }
+
+  SUBCASE("registry_contains_task_for_layer") {
+    SUBCASE("Task exists") {
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{attrs, std::nullopt}},
+      });
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+        CHECK(result == true);
+      }
+      SUBCASE("Fwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::FWD);
+        CHECK(result == true);
+      }
+      SUBCASE("Bwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::BWD);
+        CHECK(result == true);
+      }
+    }
+
+    SUBCASE("Partial task does not exist") {
+      ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{
+          BatchMatmulAttrs{/*a_seq_length_dim=*/10_n,
+                           /*b_seq_length_dim=*/20_n}};
+      TaskRegistry task_registry = construct_task_registry({
+          {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}},
+      });
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+        CHECK(result == false);
+      }
+      SUBCASE("Fwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::FWD);
+        CHECK(result == true);
+      }
+      SUBCASE("Bwd") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::BWD);
+        CHECK(result == true);
+      }
+    }
+
+    SUBCASE("Empty tasks") {
+      std::unordered_map<layer_guid_t, std::optional<task_id_t>>
+          empty_task_ids = {{layer_guid, std::nullopt}};
+      TaskRegistry task_registry =
+          TaskRegistry{empty_task_ids, empty_task_ids, empty_task_ids, {}};
+      SUBCASE("Init") {
+        bool result = registry_contains_task_for_layer(
+            task_registry, layer_guid, OpTaskType::INIT);
+
CHECK(result == false); + } + SUBCASE("Fwd") { + bool result = registry_contains_task_for_layer( + task_registry, layer_guid, OpTaskType::FWD); + CHECK(result == false); + } + SUBCASE("Bwd") { + bool result = registry_contains_task_for_layer( + task_registry, layer_guid, OpTaskType::BWD); + CHECK(result == false); + } + } + } } } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 00f4c1c27c..662e7b1878 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,9 +1,9 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.dtg.h" #include "test/utils/doctest/fmt/pair.h" #include "test/utils/doctest/fmt/unordered_map.h" From aef8ad58196f7b7f724fc7f0a1a65af24ee12acd Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sat, 22 Feb 2025 07:43:31 -0800 Subject: [PATCH 48/91] Remove lowered tensor source --- .../local-execution/local_tensor_backing.h | 1 - .../local-execution/lowered_tensor_source.h | 20 ------------------- .../src/lowered_tensor_source.cc | 14 ------------- 3 files changed, 35 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/lowered_tensor_source.h delete mode 100644 lib/local-execution/src/lowered_tensor_source.cc diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index 70a2474159..f6168f2fb1 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -8,7 +8,6 @@ #include "local-execution/local_task_argument_accessor.h" #include "local-execution/local_tensor_backing.dtg.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" diff --git a/lib/local-execution/include/local-execution/lowered_tensor_source.h b/lib/local-execution/include/local-execution/lowered_tensor_source.h deleted file mode 100644 index bd0b90dd75..0000000000 --- a/lib/local-execution/include/local-execution/lowered_tensor_source.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOWERED_TENSOR_SOURCE_H - -#include "task-spec/lowered_tensor_t.dtg.h" - -namespace FlexFlow { - -struct LoweredTensorSource { -public: - LoweredTensorSource(); - - lowered_tensor_t new_lowered_tensor(); - -private: - static size_t next_available_lowered_tensor_id; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/src/lowered_tensor_source.cc b/lib/local-execution/src/lowered_tensor_source.cc deleted file mode 100644 index af80aa2335..0000000000 --- a/lib/local-execution/src/lowered_tensor_source.cc +++ /dev/null @@ -1,14 +0,0 @@ -#include "local-execution/lowered_tensor_source.h" - -namespace FlexFlow { - -size_t LoweredTensorSource::next_available_lowered_tensor_id = 0; - -LoweredTensorSource::LoweredTensorSource() {} - -lowered_tensor_t 
LoweredTensorSource::new_lowered_tensor() { - return lowered_tensor_t{ - LoweredTensorSource::next_available_lowered_tensor_id++}; -} - -} // namespace FlexFlow From f0a4285bf4262bc793f9e4e8f4aa4e2c51d048fd Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sun, 23 Feb 2025 17:25:21 -0800 Subject: [PATCH 49/91] Loss and update tests --- .../local-execution/allocated_tensors.h | 2 + lib/local-execution/src/allocated_tensors.cc | 4 + lib/local-execution/test/CMakeLists.txt | 7 +- .../test/src/test_local_cost_estimator.cc | 138 +++++++++--------- lib/local-execution/test/src/test_loss_e2e.cc | 97 ------------ .../test/src/test_loss_functions.cc | 127 ++++++++++++++++ lib/local-execution/test/src/test_update.cc | 115 +++++++++++++++ .../test/src/test_update_e2e.cc | 93 ------------ 8 files changed, 319 insertions(+), 264 deletions(-) delete mode 100644 lib/local-execution/test/src/test_loss_e2e.cc create mode 100644 lib/local-execution/test/src/test_loss_functions.cc create mode 100644 lib/local-execution/test/src/test_update.cc delete mode 100644 lib/local-execution/test/src/test_update_e2e.cc diff --git a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h index 7581a159ad..f3face6ace 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ b/lib/local-execution/include/local-execution/allocated_tensors.h @@ -25,6 +25,8 @@ bool is_allocated_tensor_backing_valid( std::unordered_map const &, ArrayShape const &); +AllocatedTensors make_empty_allocated_tensors(); + } // namespace FlexFlow #endif diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 3e249bf6d1..2c40cc3b86 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -138,4 +138,8 @@ bool are_allocated_tensors_valid( are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); } +AllocatedTensors make_empty_allocated_tensors() { + return AllocatedTensors{{}, {}, {}}; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 4bcb37ea48..930ab5c4e2 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -2,12 +2,7 @@ ff_add_test_executable( NAME local-execution-tests SRC_PATTERNS - src/test_allocated_tensors.cc - src/test_unallocated_tensors.cc - src/test_task_registry.cc - src/test_utils.cc - src/test_local_task_arg_accessor.cc - src/test_local_tensor_backing.cc + src/*.cc PRIVATE_INCLUDE src/ DEPS diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index da3af6e3ad..7220d2a367 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -1,79 +1,81 @@ -// #include "doctest/doctest.h" -// #include "kernels/local_cuda_allocator.h" -// #include "kernels/managed_per_device_ff_handle.h" -// #include "local-execution/local_cost_estimator.h" -// #include "op-attrs/ops/attention.h" -// #include "op-attrs/parallel_tensor_shape.h" -// #include "pcg/computation_graph_builder.h" -// #include "test_utils.h" +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/local_cost_estimator.h" +#include "op-attrs/ops/attention.h" +#include 
"op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "test_utils.h" -// using namespace ::FlexFlow; +using namespace ::FlexFlow; -// TEST_SUITE(FF_CUDA_TEST_SUITE) { -// TEST_CASE("Local Cost Estimator") { -// // local backing initialization -// ManagedPerDeviceFFHandle managed_handle{}; +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Local Cost Estimator") { + // local backing initialization + ManagedPerDeviceFFHandle managed_handle{}; -// RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ -// DeviceSpecific::create(managed_handle.raw_handle()), -// EnableProfiling::YES, -// ProfilingSettings{/*warmup_iters=*/0, -// /*measure_iters=*/1}}; + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}}; -// LocalCostEstimator cost_estimator = -// LocalCostEstimator{runtime_arg_config}; + LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; -// SUBCASE("Estimate cost -- Attention Op") { -// int embed_dim = 32; -// int num_heads = 10; -// MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ -// /*embed_dim=*/embed_dim, -// /*num_heads=*/num_heads, -// /*kdim=*/embed_dim, -// /*vdim=*/embed_dim, -// /*dropout=*/0.0, -// /*bias=*/true, -// /*add_bias_kv=*/false, -// /*add_zero_attn=*/false, -// }; + SUBCASE("Estimate cost -- Attention Op") { + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; + MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }; -// size_t batch_size = 40; -// size_t seq_len = 48; -// size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; -// DataType dtype = DataType::FLOAT; -// ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ -// TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, -// DataType::FLOAT, -// }); + DataType dtype = DataType::FLOAT; + ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ + TensorDims{ + FFOrdered{batch_size, seq_len, feature_size}}, + DataType::FLOAT, + }); -// ParallelTensorShape weights_shape = throw_if_unexpected( -// get_weights_shape(attrs, inputs_shape, inputs_shape, -// inputs_shape)); -// ParallelTensorAttrs weight_attrs = -// ParallelTensorAttrs{weights_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; + ParallelTensorShape weights_shape = throw_if_unexpected( + get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + ParallelTensorAttrs weight_attrs = + ParallelTensorAttrs{weights_shape, + /*sync_type=*/std::nullopt, + /*initializer=*/std::nullopt, + CreateGrad::YES}; -// ParallelTensorShape output_shape = throw_if_unexpected( -// get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); -// ParallelTensorAttrs output_attrs = -// ParallelTensorAttrs{output_shape, -// /*sync_type=*/std::nullopt, -// /*initializer=*/std::nullopt, -// CreateGrad::YES}; + ParallelTensorShape output_shape = throw_if_unexpected( + get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + ParallelTensorAttrs output_attrs = + ParallelTensorAttrs{output_shape, + /*sync_type=*/std::nullopt, + /*initializer=*/std::nullopt, + CreateGrad::YES}; -// CostDetails 
result = cost_estimator.estimate_cost( -// PCGOperatorAttrs{attrs}, -// std::vector{ -// inputs_shape, inputs_shape, inputs_shape}, -// std::vector{weight_attrs}, -// std::vector{output_attrs}, -// make_1d_machine_view(gpu_id_t{0}, gpu_id_t{1})); + CostDetails result = cost_estimator.estimate_cost( + PCGOperatorAttrs{attrs}, + std::vector{ + inputs_shape, inputs_shape, inputs_shape}, + std::vector{weight_attrs}, + std::vector{output_attrs}, + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{0_n})); -// CHECK(result.total_elapsed_time > 0); -// CHECK(result.total_mem_usage > 0); -// } -// } -// } + CHECK(result.total_elapsed_time > 0); + CHECK(result.total_mem_usage > 0); + } + } +} diff --git a/lib/local-execution/test/src/test_loss_e2e.cc b/lib/local-execution/test/src/test_loss_e2e.cc deleted file mode 100644 index 62778c2e32..0000000000 --- a/lib/local-execution/test/src/test_loss_e2e.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/local_training_backing.h" - -#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" - -namespace FlexFlow { - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Execution E2E") { - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // construct graph - ComputationGraphBuilder cg_builder; - - size_t batch_size = 10; - size_t data_dim = 100; - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = - cg_builder.create_input(input_shape, CreateGrad::YES); - - float scalar = 4.0; - std::string layer_name = "scalar multiply"; - tensor_guid_t logit_tensor = - cg_builder.scalar_multiply(input_tensor, scalar, layer_name); - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - // allocate memory - Allocator allocator = create_local_cuda_memory_allocator(); - - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, - runtime_arg_config); - - local_backing.register_and_allocate_layer(layer_guid); - - SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { - TensorShape label_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 1}}, DataType::FLOAT}; - lowered_tensor_t label_tensor = lowered_tensor_t{-1}; - GenericTensorAccessorW label_backing = - allocator.allocate_tensor(label_shape); - local_backing.local_tensor_backing.non_graph_tensor_mapping.insert( - {label_tensor, label_backing}); - LossAttrs loss_attrs = LossAttrs{ - SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - local_backing.compute_loss(loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("NonconfigurableLossAttrs") { - lowered_tensor_t label_tensor = lowered_tensor_t{-1}; - GenericTensorAccessorW label_backing = - allocator.allocate_tensor(input_shape); - local_backing.local_tensor_backing.non_graph_tensor_mapping.insert( - {label_tensor, label_backing}); - - 
SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { - LossAttrs loss_attrs = LossAttrs{ - NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ - LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - - SUBCASE("LossFunction::IDENTITY") { - LossAttrs loss_attrs = - LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - local_backing.compute_loss( - loss_attrs, lower(logit_tensor), label_tensor); - } - } - } -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc new file mode 100644 index 0000000000..c0386a4171 --- /dev/null +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -0,0 +1,127 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Loss Functions") { + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_for_nonconfigurable_loss_attrs = + loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_for_sparse_cce_loss_attrs = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 100_n; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + TensorShape reduced_input_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = + allocator.allocate_tensor(reduced_input_tensor_shape); + GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = + allocator.allocate_tensor(reduced_input_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, + label_for_nonconfigurable_loss_attrs_backing}, + {TensorTypeVariant{label_for_sparse_cce_loss_attrs}, + label_for_sparse_cce_loss_attrs_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorAttrs input_tensor_attrs = TensorAttrs{ + input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + {input_tensor_attrs}); + + float scalar = 4.0; + LayerAddedResult scalar_multiply_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ + OperatorType::SCALAR_MULTIPLY, scalar}}, + "scalar_mult"}, + inputs_layer.outputs, + {input_tensor_attrs}); + tensor_guid_t label_tensor = scalar_multiply_operator.outputs.at(0); + + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig 
runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LocalTrainingBacking local_training_backing = LocalTrainingBacking{ + allocator, allocated_tensors, computation_graph, runtime_arg_config}; + + SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + LossAttrs loss_attrs = LossAttrs{ + SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; + + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_sparse_cce_loss_attrs, + allocator); + } + + SUBCASE("NonconfigurableLossAttrs") { + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + + SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ + LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + + SUBCASE("LossFunction::IDENTITY") { + LossAttrs loss_attrs = + LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; + compute_loss(local_training_backing, + loss_attrs, + label_tensor, + label_for_nonconfigurable_loss_attrs, + allocator); + } + } + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc new file mode 100644 index 0000000000..3121d8e02b --- /dev/null +++ b/lib/local-execution/test/src/test_update.cc @@ -0,0 +1,115 @@ +#include "doctest/doctest.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" + +namespace FlexFlow { + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("Execute Update") { + Allocator allocator = create_local_cuda_memory_allocator(); + AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 100_n; + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorAttrs input_tensor_attrs = TensorAttrs{ + input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + + LayerAddedResult inputs_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, + {}, + {input_tensor_attrs}); + + float scalar = 4.0; + LayerAddedResult scalar_multiply_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ + OperatorType::SCALAR_MULTIPLY, scalar}}, + "scalar_mult"}, + inputs_layer.outputs, + {input_tensor_attrs}); + + // initialize runtime configs + ManagedPerDeviceFFHandle managed_handle{}; + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; 
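+
+    // For reference, the SGD subcases below exercise an update of roughly the
+    // following common form (a sketch only; the kernel's exact handling of
+    // weight decay and nesterov may differ):
+    //   g' = g + weight_decay * w
+    //   v  = momentum * v + lr * g'   (momentum buffer, when momentum != 0)
+    //   w  = w - v                    (plain w = w - lr * g' when momentum == 0)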
+ + SUBCASE("SGDOptimizerAttrs") { + SUBCASE("momentum=0") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + SUBCASE("momentum=0.9") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + } + SUBCASE("AdamOptimizerAttrs") { + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + execute_update(local_training_backing, + scalar_multiply_operator.layer, + optimizer_attrs, + allocator); + } + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update_e2e.cc b/lib/local-execution/test/src/test_update_e2e.cc deleted file mode 100644 index 4658a2a544..0000000000 --- a/lib/local-execution/test/src/test_update_e2e.cc +++ /dev/null @@ -1,93 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/local_training_backing.h" - -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" - -namespace FlexFlow { - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Execution Update E2E") { - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // construct graph - ComputationGraphBuilder cg_builder; - - size_t batch_size = 10; - size_t data_dim = 100; - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - tensor_guid_t input_tensor = - cg_builder.create_input(input_shape, CreateGrad::YES); - - float scalar = 4.0; - std::string layer_name = "scalar_multiply"; - tensor_guid_t logit_tensor = - cg_builder.scalar_multiply(input_tensor, scalar, layer_name); - - // allocate memory - Allocator allocator = create_local_cuda_memory_allocator(); - LocalTrainingBacking local_backing(allocator, - cg_builder.computation_graph, - LayerTensorBackingMap{}, - TensorBackingMap{}, - runtime_arg_config); - // for (layer_guid_t const & node: - // topological_ordering(cg_builder.computation_graph)) { - // local_backing.register_and_allocate_layer(node); - // } - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - local_backing.register_and_allocate_layer(layer_guid); - - 
SUBCASE("SGDOptimizerAttrs") { - SUBCASE("momentum=0") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.0f, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - SUBCASE("momentum=0.9") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - } - SUBCASE("AdamOptimizerAttrs") { - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; - local_backing.allocate_layer_optimizer_tensors(layer_guid, - optimizer_attrs); - local_backing.execute_update(layer_guid, optimizer_attrs); - } - } -} - -} // namespace FlexFlow From 350babf3584c3d99e76e4dc0f72a658aa0222afc Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Sun, 23 Feb 2025 19:11:42 -0800 Subject: [PATCH 50/91] Passing tests after merge issues --- lib/local-execution/src/allocated_tensors.cc | 5 +- .../src/local_cost_estimator.cc | 59 +++-- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/optimizer.cc | 224 +++++++++++------- lib/local-execution/src/task_registry.cc | 2 +- .../src/unallocated_tensors.cc | 4 +- .../test/src/test_allocated_tensors.cc | 6 - .../test/src/test_local_cost_estimator.cc | 10 +- .../test/src/test_local_tensor_backing.cc | 4 - .../test/src/test_loss_functions.cc | 74 +++--- .../test/src/test_unallocated_tensors.cc | 6 - lib/local-execution/test/src/test_update.cc | 51 ++-- 12 files changed, 247 insertions(+), 200 deletions(-) diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 2c40cc3b86..196da16ace 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -54,8 +54,7 @@ bool are_allocated_gradient_tensors_valid( for (std::pair const &tensor_to_grad : allocated_tensors.gradient_mapping) { if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_gradients == - CreateGrad::NO) { + if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) { return false; } @@ -96,7 +95,7 @@ bool are_allocated_optimizer_tensors_valid( for (std::pair> const &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == + if (tensor_attrs.at(tensor_to_optimizers.first).create_grad == CreateGrad::NO) { return false; } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 5c17f011e4..9828a67293 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -4,13 +4,13 @@ #include "local-execution/tracked_allocator.h" #include "op-attrs/computation_graph_op_attrs.h" #include "op-attrs/pcg_operator_attrs.h" -#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/computation_graph.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" #include "pcg/machine_view.dtg.h" #include 
"pcg/parallel_tensor_attrs.h" #include "utils/containers/concat_vectors.h" +#include "utils/containers/get_only.h" #include "utils/containers/sum.h" -#include "pcg/parallel_tensor_attrs.h" #include "utils/containers/transform.h" #include "utils/containers/values.h" @@ -26,41 +26,36 @@ static ComputationGraph create_computation_graph_for_local_cost_estimation( std::vector const &outputs) { ComputationGraph computation_graph = make_empty_computation_graph(); - // create layer for inputs - auto get_vector_piece_attrs_from_parallel_tensor_shape = - [](std::vector const ¶llel_shapes) { - return transform(parallel_shapes, [](ParallelTensorShape const &p) { - return TensorAttrs{ - get_piece_shape(p), std::nullopt, std::nullopt, CreateGrad::YES}; - }); - }; - - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - get_vector_piece_attrs_from_parallel_tensor_shape(inputs)); - - // create layer for weights - auto get_vector_piece_attrs_from_parallel_tensor_attrs = - [](std::vector const ¶llel_attrs) { - return transform(parallel_attrs, [](ParallelTensorAttrs const &p) { - return get_piece_attrs(p); - }); - }; - - LayerAddedResult weights_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "weights"}, - {}, - get_vector_piece_attrs_from_parallel_tensor_attrs(weights)); + std::vector input_tensors; + for (ParallelTensorShape const &input : inputs) { + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, + std::nullopt}, + {}, + {}); + input_tensors.push_back(get_only(inputs_layer.outputs)); + } + + std::vector weight_tensors; + for (ParallelTensorAttrs const &weight : weights) { + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + get_piece_shape(weight.shape), + InitializerAttrs{ZeroInitializerAttrs{}}}}, + std::nullopt}, + {}, + {}); + weight_tensors.push_back(get_only(weights_layer.outputs)); + } // create operator layer LayerAddedResult operator_layer = add_layer( computation_graph, LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, - concat_vectors(inputs_layer.outputs, weights_layer.outputs), - get_vector_piece_attrs_from_parallel_tensor_attrs(outputs)); + input_tensors, + weight_tensors); return computation_graph; } diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index df15c707b2..77e62e52af 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -213,7 +213,7 @@ void execute_update(LocalTrainingBacking const &local_training_backing, Allocator &allocator) { LayerAttrs layer_attrs = get_layer_attrs(local_training_backing.computation_graph, node); - if (layer_attrs.attrs.has()) { + if (layer_attrs.op_attrs.has()) { // get tensors tensor_guid_t weight_tensor = get_only( get_outgoing_tensors(local_training_backing.computation_graph, node)); diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index a69ae9da61..1b9ce83d14 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -1,6 +1,7 @@ #include "local-execution/optimizer.h" #include "kernels/optimizer_kernels.h" #include "task-spec/profiling.h" +#include "utils/containers/get_only.h" #include "utils/overload.h" namespace FlexFlow { @@ -24,9 +25,12 @@ 
TaskSignature get_sgd_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - add_unchecked_arg_slot(sig, HANDLE); - } + add_unchecked_arg_slot( + sig, HANDLE); // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // add_unchecked_arg_slot(sig, HANDLE); + // } return sig; } @@ -44,12 +48,16 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs, b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - b.bind_arg(HANDLE, ff_handle()); - return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b}; - } else { - return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b}; - } + b.bind_arg(HANDLE, ff_handle()); + return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, + b}; // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // b.bind_arg(HANDLE, ff_handle()); + // return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID, b}; + // } else { + // return TaskInvocation{task_id_t::SGD_UPD_PS_TASK_ID, b}; + // } } static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { @@ -73,35 +81,49 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { sgd_v_ptr = sgd_v.get_float_ptr(); } - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - auto handle = acc.get_argument(HANDLE); - profile(sgd_nccl_update_task_gpu, - profiling, - "[SGD NCCL] update_time = %.2lfms\n", - attrs.lr, - attrs.momentum, - attrs.nesterov, - attrs.weight_decay, - handle, - weight_grad.get_float_ptr(), - size, - weight.get_float_ptr(), - sgd_v_ptr); - - } else { - profile(sgd_ps_update_task_gpu, - profiling, - "[SGD PS] update_time = %.2lfms\n", - attrs.lr, - attrs.momentum, - attrs.nesterov, - attrs.weight_decay, - weight_grad.get_float_ptr(), - size, - num_replicas, - weight.get_float_ptr(), - sgd_v_ptr); - } + auto handle = acc.get_argument(HANDLE); + profile(sgd_nccl_update_task_gpu, + profiling, + "[SGD NCCL] update_time = %.2lfms\n", + attrs.lr, + attrs.momentum, + attrs.nesterov, + attrs.weight_decay, + handle, + weight_grad.get_float_ptr(), + size, + weight.get_float_ptr(), + sgd_v_ptr); // how to deal with removal of ParamSync? + + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // auto handle = acc.get_argument(HANDLE); + // profile(sgd_nccl_update_task_gpu, + // profiling, + // "[SGD NCCL] update_time = %.2lfms\n", + // attrs.lr, + // attrs.momentum, + // attrs.nesterov, + // attrs.weight_decay, + // handle, + // weight_grad.get_float_ptr(), + // size, + // weight.get_float_ptr(), + // sgd_v_ptr); + + // } else { + // profile(sgd_ps_update_task_gpu, + // profiling, + // "[SGD PS] update_time = %.2lfms\n", + // attrs.lr, + // attrs.momentum, + // attrs.nesterov, + // attrs.weight_decay, + // weight_grad.get_float_ptr(), + // size, + // num_replicas, + // weight.get_float_ptr(), + // sgd_v_ptr); + // } } TaskImplFunction get_sgd_update_task_impl() { @@ -117,9 +139,11 @@ TaskSignature get_adam_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - add_unchecked_arg_slot(sig, HANDLE); - } + add_unchecked_arg_slot( + sig, HANDLE); // how to deal with removal of ParamSync? 
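+
+  // Note: with ParamSync removed, HANDLE is always bound and the NCCL task id
+  // is used unconditionally; the old PS/NCCL dispatch is kept in the comment
+  // below until the open question above is settled.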
+ // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // add_unchecked_arg_slot(sig, HANDLE); + // } return sig; } @@ -135,13 +159,16 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(HANDLE, ff_handle()); + return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, + b}; // how to deal with removal of ParamSync? - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - b.bind_arg(HANDLE, ff_handle()); - return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; - } else { - return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b}; - } + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // b.bind_arg(HANDLE, ff_handle()); + // return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; + // } else { + // return TaskInvocation{task_id_t::ADAM_UPD_PS_TASK_ID, b}; + // } } static void adam_update_task_impl(TaskArgumentAccessor const &acc) { @@ -162,38 +189,54 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); - if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { - auto handle = acc.get_argument(HANDLE); - profile(adam_nccl_update_task_gpu, - profiling, - "[Adam NCCL] update_time = %.2lfms\n", - attrs.alpha_t, - attrs.beta1, - attrs.beta2, - attrs.weight_decay, - attrs.epsilon, - size, - handle, - weight_grad.get_float_ptr(), - m_tensor.get_float_ptr(), - v_tensor.get_float_ptr(), - weight.get_float_ptr()); - } else { - profile(adam_ps_update_task_gpu, - profiling, - "[Adam NCCL] update_time = %.2lfms\n", - attrs.alpha_t, - attrs.beta1, - attrs.beta2, - attrs.weight_decay, - attrs.epsilon, - size, - num_replicas, - weight_grad.get_float_ptr(), - m_tensor.get_float_ptr(), - v_tensor.get_float_ptr(), - weight.get_float_ptr()); - } + auto handle = acc.get_argument(HANDLE); + profile(adam_nccl_update_task_gpu, + profiling, + "[Adam NCCL] update_time = %.2lfms\n", + attrs.alpha_t, + attrs.beta1, + attrs.beta2, + attrs.weight_decay, + attrs.epsilon, + size, + handle, + weight_grad.get_float_ptr(), + m_tensor.get_float_ptr(), + v_tensor.get_float_ptr(), + weight.get_float_ptr()); // how to deal with removal of ParamSync? 
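+
+  // For reference, adam_update_task_impl further below applies roughly the
+  // standard Adam step (a sketch; bias correction is assumed to be folded
+  // into alpha_t / beta_t / beta2_t by the caller):
+  //   g' = g + weight_decay * w
+  //   m  = beta1 * m + (1 - beta1) * g'
+  //   v  = beta2 * v + (1 - beta2) * g'^2
+  //   w  = w - alpha_t * m / (sqrt(v) + epsilon)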
+ + // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { + // auto handle = acc.get_argument(HANDLE); + // profile(adam_nccl_update_task_gpu, + // profiling, + // "[Adam NCCL] update_time = %.2lfms\n", + // attrs.alpha_t, + // attrs.beta1, + // attrs.beta2, + // attrs.weight_decay, + // attrs.epsilon, + // size, + // handle, + // weight_grad.get_float_ptr(), + // m_tensor.get_float_ptr(), + // v_tensor.get_float_ptr(), + // weight.get_float_ptr()); + // } else { + // profile(adam_ps_update_task_gpu, + // profiling, + // "[Adam NCCL] update_time = %.2lfms\n", + // attrs.alpha_t, + // attrs.beta1, + // attrs.beta2, + // attrs.weight_decay, + // attrs.epsilon, + // size, + // num_replicas, + // weight_grad.get_float_ptr(), + // m_tensor.get_float_ptr(), + // v_tensor.get_float_ptr(), + // weight.get_float_ptr()); + // } } TaskImplFunction get_adam_update_task_impl() { @@ -211,17 +254,18 @@ TaskInvocation get_update_invocation( tensor_guid_t const &weight, gradient_tensor_t const &weight_grad, std::vector const &grad_buffer_tensors) { - return attrs.visit(overload{ - [&](SGDOptimizerAttrs const &s) { - return sgd_update(s, weight, weight_grad, grad_buffer_tensors.at(0)); - }, - [&](AdamOptimizerAttrs const &s) { - return adam_update(s, - weight, - weight_grad, - grad_buffer_tensors.at(0), - grad_buffer_tensors.at(1)); - }}); + return attrs.visit( + overload{[&](SGDOptimizerAttrs const &s) { + return sgd_update( + s, weight, weight_grad, get_only(grad_buffer_tensors)); + }, + [&](AdamOptimizerAttrs const &s) { + return adam_update(s, + weight, + weight_grad, + grad_buffer_tensors.at(0), + grad_buffer_tensors.at(1)); + }}); } TaskImplFunction get_update_task_impl(OptimizerAttrs const &attrs) { diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 487bd4420e..3d9dec1e26 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -19,7 +19,7 @@ TaskRegistry construct_task_registry( fwd_task_ids.insert({node, std::nullopt}); bwd_task_ids.insert({node, std::nullopt}); - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t const &task_id : task_ids) { diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index ea64a46051..363d1eedef 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -23,7 +23,7 @@ UnallocatedTensors generate_unallocated_tensors( tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); } - if (tensor_attrs.create_gradients == CreateGrad::YES && + if (tensor_attrs.create_grad == CreateGrad::YES && !allocated_tensors.gradient_mapping.count(tensor_guid)) { gradient_tensor_t gradient_tensor = gradient_tensor_source.new_gradient_tensor(); @@ -61,7 +61,7 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( tensor_attrs_mapping) { tensor_guid_t tensor_guid = tensor_guid_attrs.first; TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES) { + if (tensor_attrs.create_grad == CreateGrad::YES) { std::vector optimizer_tensors; int num_optimizer_tensors_to_allocate = diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 99abd538d5..45fc8e0a1c 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ 
b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -31,20 +31,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 7220d2a367..30682c9a48 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -50,18 +50,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ParallelTensorShape weights_shape = throw_if_unexpected( get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); ParallelTensorAttrs weight_attrs = - ParallelTensorAttrs{weights_shape, - /*sync_type=*/std::nullopt, - /*initializer=*/std::nullopt, - CreateGrad::YES}; + ParallelTensorAttrs{weights_shape, CreateGrad::YES}; ParallelTensorShape output_shape = throw_if_unexpected( get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); ParallelTensorAttrs output_attrs = - ParallelTensorAttrs{output_shape, - /*sync_type=*/std::nullopt, - /*initializer=*/std::nullopt, - CreateGrad::YES}; + ParallelTensorAttrs{output_shape, CreateGrad::YES}; CostDetails result = cost_estimator.estimate_cost( PCGOperatorAttrs{attrs}, diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 083b677e18..594051c2f1 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -96,14 +96,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs allocated_tensor_attrs = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index c0386a4171..bb3e83cc4d 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -9,6 +9,7 @@ #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" +#include "utils/containers/get_only.h" namespace FlexFlow { @@ -24,19 +25,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 100_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - TensorShape reduced_input_tensor_shape = + TensorShape reduced_tensor_shape = TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, DataType::FLOAT}; GenericTensorAccessorW 
label_for_nonconfigurable_loss_attrs_backing = - allocator.allocate_tensor(reduced_input_tensor_shape); + allocator.allocate_tensor(output_tensor_shape); GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = - allocator.allocate_tensor(reduced_input_tensor_shape); + allocator.allocate_tensor(reduced_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, label_for_nonconfigurable_loss_attrs_backing}, @@ -48,24 +50,40 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorAttrs input_tensor_attrs = TensorAttrs{ - input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; - - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - {input_tensor_attrs}); - - float scalar = 4.0; - LayerAddedResult scalar_multiply_operator = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ - OperatorType::SCALAR_MULTIPLY, scalar}}, - "scalar_mult"}, - inputs_layer.outputs, - {input_tensor_attrs}); - tensor_guid_t label_tensor = scalar_multiply_operator.outputs.at(0); + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, + "inputs"}, + {}, + {}); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/true, + DataType::FLOAT, + std::nullopt, + std::nullopt}}, + "linear"}, + inputs_layer.outputs, + {}); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; @@ -85,7 +103,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_sparse_cce_loss_attrs, allocator); } @@ -96,7 +114,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } @@ -106,7 +124,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } @@ -116,7 +134,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; compute_loss(local_training_backing, loss_attrs, - label_tensor, + logit_tensor, label_for_nonconfigurable_loss_attrs, allocator); } diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 662e7b1878..82f5a132fe 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -40,20 +40,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ 
TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, DataType::FLOAT}, - std::nullopt, - std::nullopt, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 3121d8e02b..d6108635af 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -20,29 +20,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 100_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; TensorShape input_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorAttrs input_tensor_attrs = TensorAttrs{ - input_tensor_shape, std::nullopt, std::nullopt, CreateGrad::YES}; + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, + "inputs"}, + {}, + {}); - LayerAddedResult inputs_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{}}, "inputs"}, - {}, - {input_tensor_attrs}); + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, + {}); - float scalar = 4.0; - LayerAddedResult scalar_multiply_operator = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{ElementUnaryAttrs{ - OperatorType::SCALAR_MULTIPLY, scalar}}, - "scalar_mult"}, - inputs_layer.outputs, - {input_tensor_attrs}); + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/true, + DataType::FLOAT, + std::nullopt, + std::nullopt}}, + "linear"}, + inputs_layer.outputs, + {}); // initialize runtime configs ManagedPerDeviceFFHandle managed_handle{}; @@ -66,7 +79,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } @@ -83,7 +96,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } @@ -105,7 +118,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { runtime_arg_config, optimizer_attrs}; execute_update(local_training_backing, - scalar_multiply_operator.layer, + linear_operator.layer, optimizer_attrs, allocator); } From aef7c6e3c3087f15b4c90792148f170da84f6f7c Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Tue, 25 Feb 2025 06:32:27 -0800 Subject: [PATCH 51/91] Pass gpu tests --- lib/kernels/src/array_shape.cc | 6 +-- lib/kernels/src/legion_dim.cc | 6 --- .../local-execution/local_training_backing.h | 11 ++--- .../src/local_cost_estimator.cc | 3 ++ .../src/local_training_backing.cc | 28 ++++++----- lib/local-execution/src/task_registry.cc | 1 + 
lib/local-execution/test/CMakeLists.txt | 7 +++ .../test/modify_test_commands.cmake | 21 ++++++++ .../test/src/test_loss_functions.cc | 40 +++++++-------- lib/local-execution/test/src/test_update.cc | 36 ++++++++------ .../include/task-spec/task_arg_spec.h | 12 +++++ lib/task-spec/src/op_task_invocation.cc | 49 +++++++++---------- lib/task-spec/src/task_arg_spec.cc | 11 +++++ lib/task-spec/src/task_invocation.cc | 33 ++++++++++++- 14 files changed, 173 insertions(+), 91 deletions(-) create mode 100644 lib/local-execution/test/modify_test_commands.cmake create mode 100644 lib/task-spec/include/task-spec/task_arg_spec.h create mode 100644 lib/task-spec/src/task_arg_spec.cc diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index 521b15e435..f4011af79f 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -64,16 +64,16 @@ ArrayShape ArrayShape::sub_shape(std::optional start, ArrayShape ArrayShape::sub_shape(std::optional start, std::optional end) const { - std::optional legion_start = + std::optional ff_end = transform(start, [&](auto const &start_unwrapped) { return ff_dim_from_legion_dim(start_unwrapped, num_dims()); }); - std::optional legion_end = + std::optional ff_start = transform(end, [&](auto const &end_unwrapped) { return ff_dim_from_legion_dim(end_unwrapped, num_dims()); }); - return this->sub_shape(legion_start, legion_end); + return this->sub_shape(ff_start, ff_end); } bool ArrayShape::operator==(ArrayShape const &other) const { diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 49b028f227..f373cf0410 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -13,12 +13,6 @@ legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, ff_dim.value.unwrap_nonnegative() - 1}}; } -ff_dim_t legion_dim_from_ff_dim(legion_dim_t legion_dim, - nonnegative_int num_dimensions) { - return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - - legion_dim.value.unwrap_nonnegative() - 1}}; -} - ff_dim_t ff_dim_from_legion_dim(legion_dim_t legion_dim, nonnegative_int num_dimensions) { return ff_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index 8c2bb34130..addac74633 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -15,24 +15,23 @@ namespace FlexFlow { struct LocalTrainingBacking { LocalTrainingBacking(Allocator &, AllocatedTensors const &, + GradientTensorSource &, ComputationGraph const &, RuntimeArgConfig const &); LocalTrainingBacking(Allocator &, AllocatedTensors const &, + GradientTensorSource &, + OptimizerTensorSource &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); public: - LocalTensorBacking local_tensor_backing; - LocalArgsBacking local_args_backing; - ComputationGraph computation_graph; TaskRegistry task_registry; - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; + LocalTensorBacking local_tensor_backing; + LocalArgsBacking local_args_backing; }; LocalArgsBacking initialize_args_backing(TaskRegistry const &, diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 9828a67293..532fcc91c2 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ 
b/lib/local-execution/src/local_cost_estimator.cc @@ -82,8 +82,11 @@ CostDetails LocalCostEstimator::estimate_cost( std::make_shared(create_local_cuda_memory_allocator()); Allocator allocator = Allocator(tracked_allocator_ptr); + GradientTensorSource gradient_tensor_source; + LocalTrainingBacking local_backing(allocator, AllocatedTensors{{}, {}, {}}, + gradient_tensor_source, computation_graph, this->runtime_arg_config); // execute layer diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 77e62e52af..b2e0a2fb7e 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -18,20 +18,20 @@ namespace FlexFlow { LocalTrainingBacking::LocalTrainingBacking( Allocator &allocator, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + task_registry( + construct_task_registry(get_layer_attrs_mapping(computation_graph))), local_tensor_backing(construct_local_tensor_backing( allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), + generate_unallocated_tensors(allocated_tensors, + get_all_tensor_attrs(computation_graph), + gradient_tensor_source), allocator)), local_args_backing(initialize_args_backing(this->task_registry, - this->computation_graph, + computation_graph, runtime_arg_config, this->local_tensor_backing, allocator)){}; @@ -39,23 +39,25 @@ LocalTrainingBacking::LocalTrainingBacking( LocalTrainingBacking::LocalTrainingBacking( Allocator &allocator, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + task_registry( + construct_task_registry(get_layer_attrs_mapping(computation_graph))), local_tensor_backing(construct_local_tensor_backing( allocated_tensors, generate_unallocated_tensors_with_optimizer( allocated_tensors, - get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, - this->optimizer_tensor_source, + get_all_tensor_attrs(computation_graph), + gradient_tensor_source, + optimizer_tensor_source, optimizer_attrs), allocator)), local_args_backing(initialize_args_backing(this->task_registry, - this->computation_graph, + computation_graph, runtime_arg_config, this->local_tensor_backing, allocator)){}; diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 3d9dec1e26..2787342a5f 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -71,6 +71,7 @@ bool registry_contains_task_for_layer(TaskRegistry const &task_registry, fmt::format("Invalid OpTaskType, got {}", op_task_type)); } + assert(task_ids.count(op)); return task_ids.at(op).has_value(); } diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index 930ab5c4e2..a973c6967b 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ 
b/lib/local-execution/test/CMakeLists.txt @@ -12,3 +12,10 @@ ff_add_test_executable( kernels op-attrs ) + +set(FF_TEST_EXEC_NAME "local-execution-tests") +add_custom_command( + TARGET ${FF_TEST_EXEC_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake + DEPENDS ${FF_TEST_EXEC_NAME} +) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake new file mode 100644 index 0000000000..6494ae2d78 --- /dev/null +++ b/lib/local-execution/test/modify_test_commands.cmake @@ -0,0 +1,21 @@ +# modify_test_commands.cmake + +file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") + +foreach(ctest_tests_file IN LISTS ctest_tests_files) + file(READ "${ctest_tests_file}" content) + + # add nix run prefix + string(REGEX REPLACE + "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" + "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" + content "${content}") + + # add environment + # string(REGEX REPLACE + # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" + # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" + # content "${content}") + + file(WRITE "${ctest_tests_file}" "${content}") +endforeach() diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index bb3e83cc4d..2bf138e204 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -11,10 +11,14 @@ #include "test_utils.h" #include "utils/containers/get_only.h" -namespace FlexFlow { +using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Loss Functions") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + Allocator allocator = create_local_cuda_memory_allocator(); // allocate label tensors @@ -58,44 +62,42 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, - "inputs"}, - {}, - {}); + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); LayerAddedResult weights_layer = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, - "weights"}, + std::nullopt}, {}, {}); LayerAddedResult linear_operator = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/true, + /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, - "linear"}, + std::nullopt}, inputs_layer.outputs, - {}); + weights_layer.outputs); tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), EnableProfiling::YES, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LocalTrainingBacking local_training_backing = LocalTrainingBacking{ - allocator, allocated_tensors, computation_graph, runtime_arg_config}; + GradientTensorSource 
gradient_tensor_source; + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + computation_graph, + runtime_arg_config}; SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { LossAttrs loss_attrs = LossAttrs{ @@ -141,5 +143,3 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index d6108635af..1f8684f38a 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -9,10 +9,14 @@ #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" -namespace FlexFlow { +using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Execute Update") { + // initialize runtime configs + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -31,12 +35,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{input_tensor_shape}}, - "inputs"}, - {}, - {}); + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); LayerAddedResult weights_layer = add_layer( computation_graph, @@ -49,22 +49,22 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAddedResult linear_operator = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/true, + /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, "linear"}, inputs_layer.outputs, - {}); - - // initialize runtime configs - ManagedPerDeviceFFHandle managed_handle{}; + weights_layer.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), EnableProfiling::YES, ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { OptimizerAttrs optimizer_attrs = @@ -75,6 +75,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -92,6 +94,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -114,6 +118,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs}; @@ -124,5 +130,3 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } } - -} // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/task_arg_spec.h b/lib/task-spec/include/task-spec/task_arg_spec.h new file mode 100644 index 0000000000..38879ecab9 --- /dev/null +++ b/lib/task-spec/include/task-spec/task_arg_spec.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARG_SPEC_H +#define 
_FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARG_SPEC_H + +#include "task-spec/task_arg_spec.dtg.h" + +namespace FlexFlow { + +std::type_index get_type_index(TaskArgSpec const &); + +} + +#endif diff --git a/lib/task-spec/src/op_task_invocation.cc b/lib/task-spec/src/op_task_invocation.cc index d495dd9f92..a55995920a 100644 --- a/lib/task-spec/src/op_task_invocation.cc +++ b/lib/task-spec/src/op_task_invocation.cc @@ -79,43 +79,40 @@ OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd) { return bwd; } -bool is_op_tensor_spec_invalid(OpTensorSlotSpec const &tensor_slot_spec, - OpTensorSpec const &tensor_spec) { - return tensor_spec.role != tensor_slot_spec.tensor_role || - tensor_spec.slot_option != tensor_slot_spec.slot_option; -} - bool is_tensor_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { - auto tensor_bindings = inv.binding.get_tensor_bindings(); - for (OpTensorSlotSpec const &op_tensor_slot_spec : sig.get_tensor_slots()) { - SlotGradId tensor_key = - SlotGradId{op_tensor_slot_spec.name, op_tensor_slot_spec.is_grad}; - OpTensorSpec op_tensor_spec = tensor_bindings.at(tensor_key); - if (is_op_tensor_spec_invalid(op_tensor_slot_spec, op_tensor_spec)) { + // TODO: fix for variadic inputs (need to implement .bind() for variadic + // first) + for (std::pair const &tensor_binding : + inv.binding.get_tensor_bindings()) { + OpTensorSlotSpec op_tensor_slot_spec = + OpTensorSlotSpec{tensor_binding.first.slot_id, + SlotType::TENSOR, + tensor_binding.second.role, + tensor_binding.first.is_grad, + tensor_binding.second.slot_option}; + + if (!sig.get_tensor_slots().count(op_tensor_slot_spec)) { return false; } } - // FIXME -- make sure invocation doesn't contain MORE than signature - // https://github.com/flexflow/FlexFlow/issues/1442 return true; } -bool is_arg_type_invalid(std::type_index expected_arg_type, - OpArgSpec op_arg_spec) { - std::type_index arg_spec_type = get_op_arg_spec_type_index(op_arg_spec); - return arg_spec_type != expected_arg_type; -} - bool is_arg_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv) { - // FIXME -- arg signature/invocation checking - // https://github.com/flexflow/FlexFlow/issues/1442 - // auto sig_arg_types = sig.get_arg_types(); - // for (auto arg_binding : inv.binding.get_arg_bindings()) { - // std::type_index arg_type = sig_arg_types.at(arg_binding.first); - // assert (!is_arg_type_invalid(arg_type, arg_binding.second)); + // TODO: fix for device specific args + // for (std::pair const & arg_binding : + // inv.binding.get_arg_bindings()) { + // if (sig.get_arg_types().count(arg_binding.first)) { + // if (get_op_arg_spec_type_index(arg_binding.second) != + // sig.get_arg_types().at(arg_binding.first)) { + // return false; + // } + // } else { + // return false; + // } // } return true; diff --git a/lib/task-spec/src/task_arg_spec.cc b/lib/task-spec/src/task_arg_spec.cc new file mode 100644 index 0000000000..36fa2f71fd --- /dev/null +++ b/lib/task-spec/src/task_arg_spec.cc @@ -0,0 +1,11 @@ +#include "task-spec/task_arg_spec.h" +#include "utils/overload.h" + +namespace FlexFlow { + +std::type_index get_type_index(TaskArgSpec const &task_arg_spec) { + return task_arg_spec.visit( + overload{[](auto const &e) { return e.get_type_index(); }}); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task_invocation.cc b/lib/task-spec/src/task_invocation.cc index 4ba97f26de..e182231bda 100644 --- a/lib/task-spec/src/task_invocation.cc +++ b/lib/task-spec/src/task_invocation.cc @@ -1,9 +1,40 @@ #include 
"task-spec/task_invocation.h" +#include "task-spec/task_arg_spec.h" +#include "utils/containers/keys.h" namespace FlexFlow { bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { - NOT_IMPLEMENTED(); + TaskBinding binding = inv.binding; + + // args + for (std::pair const &arg_binding : + binding.get_arg_bindings()) { + if (sig.task_arg_types.count(arg_binding.first)) { + if (get_type_index(arg_binding.second) != + sig.task_arg_types.at(arg_binding.first)) { + return false; // incorrect arg type + } + } else { + return false; // slot doesn't exist in signature + } + } + + // tensors + for (std::pair const &tensor_binding : + binding.get_tensor_bindings()) { + slot_id_t tensor_slot_id = tensor_binding.first.slot_id; + if (sig.tensor_guid_slots.count(tensor_slot_id)) { + if (tensor_binding.first.tensor_type == + sig.tensor_guid_slots.at(tensor_slot_id).tensor_type) { + return false; // incorrect tensor type + } + } else { + return false; // slot doesn't exist in signature + } + } + + return true; } } // namespace FlexFlow From 6c84fb3feb79463d8ebe37c58833403a9b4a8b75 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 26 Feb 2025 05:19:17 -0800 Subject: [PATCH 52/91] chore: fix typo --- lib/realm-backend/include/realm-backend/driver.h | 2 +- .../include/realm-backend/model_training_instance.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h index 77272c36ad..884b97a23d 100644 --- a/lib/realm-backend/include/realm-backend/driver.h +++ b/lib/realm-backend/include/realm-backend/driver.h @@ -3,7 +3,7 @@ #include "realm.h" #include "realm/cmdline.h" -#include "local-execution/task_invocation.h" +#include "task-spec/op_task_invocation.h" void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index a35cada2d2..62d8311ccb 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -4,7 +4,7 @@ #include "realm-backend/realm_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "local-execution/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_t.dtg.h" namespace FlexFlow { From d6aa7ad7511f43ef5270901c8fe37d34c16ddd52 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 26 Feb 2025 18:12:09 -0800 Subject: [PATCH 53/91] chore: update realm allocator impl --- .../include/realm-backend/allocated_tensors.h | 30 ---- .../allocated_tensors.struct.toml | 32 ---- .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/realm_allocator.h | 37 +---- .../realm_task_argument_accessor.h | 4 +- .../realm-backend/realm_tensor_backing.h | 12 +- .../realm-backend/realm_training_backing.h | 17 +- .../unallocated_tensors.struct.toml | 31 ---- lib/realm-backend/src/allocated_tensors.cc | 2 +- lib/realm-backend/src/realm_allocator.cc | 29 +--- .../src/realm_tensor_backing copy.cc | 142 ---------------- lib/realm-backend/src/realm_tensor_backing.cc | 22 +-- .../src/realm_training_backing.cc | 157 +++++++----------- 13 files changed, 99 insertions(+), 418 deletions(-) delete mode 100644 lib/realm-backend/include/realm-backend/allocated_tensors.h delete mode 100644 
lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml delete mode 100644 lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml delete mode 100644 lib/realm-backend/src/realm_tensor_backing copy.cc diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.h b/lib/realm-backend/include/realm-backend/allocated_tensors.h deleted file mode 100644 index 8effd06954..0000000000 --- a/lib/realm-backend/include/realm-backend/allocated_tensors.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H - -#include "realm-backend/allocated_tensors.dtg.h" -#include "pcg/computation_graph.h" - -namespace FlexFlow { - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool are_allocated_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &, - std::unordered_map const &, - ArrayShape const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml deleted file mode 100644 index d459027e5d..0000000000 --- a/lib/realm-backend/include/realm-backend/allocated_tensors.struct.toml +++ /dev/null @@ -1,32 +0,0 @@ -namespace = "FlexFlow" -name = "AllocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h", - "realm-backend/realm_allocator.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, std::pair<::FlexFlow::RealmRegion,::FlexFlow::TensorShape>>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 62d8311ccb..e30ae7a9a8 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -14,7 +14,7 @@ using PerLayerElapsedTime = struct ModelTrainingInstance { ModelTrainingInstance(RealmTrainingBacking const &, tensor_guid_t const &logit_tensor, - TensorShape const &label_tensor_shape, + loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h index 1e0c7b23c4..304ca38e32 100644 --- a/lib/realm-backend/include/realm-backend/realm_allocator.h +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -3,56 +3,31 @@ #include "realm-backend/driver.h" #include "realm.h" +#include "kernels/allocation.h" #include namespace FlexFlow { struct RealmAllocatorImpl; -struct RealmRegion { - Realm::RegionInstance instance; - RealmAllocatorImpl 
*allocator; -}; - -struct RealmAllocatorImpl { +struct RealmAllocatorImpl : public IAllocator { RealmAllocatorImpl() = delete; RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; RealmAllocatorImpl(RealmAllocatorImpl &&) = delete; RealmAllocatorImpl(Realm::Processor); ~RealmAllocatorImpl() = default; - RealmRegion allocate(size_t); - void deallocate(RealmRegion); + void *allocate(size_t) override; + void deallocate(void *) override; private: - std::unordered_map ptrs; + std::unordered_map ptrs; Realm::Processor proc; Realm::Memory mem; std::vector field_sizes = {sizeof(char)}; }; -struct RealmAllocator { - RealmAllocator() = delete; - - RealmRegion allocate(size_t); - void deallocate(RealmRegion); - - template - static typename std::enable_if::value, - RealmAllocator>::type - create(Args &&...args) { - return RealmAllocator(std::make_shared(std::forward(args)...)); - } - - RealmAllocator(std::shared_ptr ptr) : i_allocator(ptr) {}; - RealmAllocator(RealmAllocator const &allocator) - : i_allocator(allocator.i_allocator) {}; - -private: - std::shared_ptr i_allocator; -}; - -RealmAllocator create_realm_memory_allocator(Realm::Processor); +Allocator create_realm_memory_allocator(Realm::Processor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index ce826e162e..d5c1a63b48 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -15,7 +15,7 @@ using TensorSlotsBacking = std::unordered_map< using ArgSlotsBacking = std::unordered_map; struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(RealmAllocator const &allocator, + RealmTaskArgumentAccessor(Allocator const &allocator, TensorSlotsBacking const &tensor_slots_backing, ArgSlotsBacking const &arg_slots_backing); @@ -35,7 +35,7 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { size_t get_device_idx() const override; private: - RealmAllocator allocator; + Allocator allocator; TensorSlotsBacking tensor_slots_backing; ArgSlotsBacking arg_slots_backing; }; diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index 25136ad2ff..dac93c84b0 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -10,21 +10,21 @@ #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/unallocated_tensors.dtg.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" namespace FlexFlow { -using TensorBackingMap = std::unordered_map>; +using TensorBackingMap = std::unordered_map; struct RealmTensorBacking { RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, - RealmAllocator const &); + Allocator const &); public: GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; @@ -45,15 +45,13 @@ struct RealmTensorBacking { std::unordered_map> tensor_optimizer_mapping; - 
RealmAllocator allocator; + Allocator allocator; private: lowered_tensor_t insert_tensor(TensorTypeVariant const &); LoweredTensorSource lowered_tensor_source; }; -GenericTensorAccessorW wrappup_tensor_accessor(std::pair const &); - UnallocatedTensors generate_unallocated_tensors( AllocatedTensors const &, std::unordered_map const &, diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 81df422b7a..45285464b8 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,7 +6,7 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -19,20 +19,27 @@ using PerLayerElapsedTime = std::unordered_map>; struct RealmTrainingBacking { - RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + RealmTrainingBacking(Realm::Processor, + std::vector const &, + std::vector const &, + AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &); - RealmTrainingBacking(Realm::Processor, AllocatedTensors const &, + RealmTrainingBacking(Realm::Processor, + std::vector const &, + std::vector const &, + AllocatedTensors const &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); public: // runtime Realm::Processor master_proc; + Realm::Event master_event; Realm::Memory master_mem; std::vector worker_procs; - std::unordered_map proc_events; - std::vector allocators; + std::vector worker_events; + std::vector allocators; RealmTensorBacking realm_tensor_backing; RealmArgsBacking realm_args_backing; diff --git a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml b/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml deleted file mode 100644 index e86cc2a532..0000000000 --- a/lib/realm-backend/include/realm-backend/unallocated_tensors.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "UnallocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "op-attrs/tensor_shape.dtg.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_shapes" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc index f27db14643..3e249bf6d1 100644 --- a/lib/realm-backend/src/allocated_tensors.cc +++ b/lib/realm-backend/src/allocated_tensors.cc @@ -1,4 +1,4 @@ -#include "realm-backend/allocated_tensors.h" +#include "local-execution/allocated_tensors.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/keys.h" #include "utils/containers/set_union.h" diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index 
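
This patch swaps the bespoke RealmRegion/RealmAllocator pair for the kernels
library's type-erased Allocator over IAllocator. The create-by-implementation
call in the hunk below follows the usual type-erasure idiom; here is a minimal
sketch under assumed, simplified signatures (malloc backing and the
IAllocator/Allocator shapes are illustrative stand-ins, not the actual
kernels API):

    #include <cstddef>
    #include <cstdlib>
    #include <memory>
    #include <utility>

    struct IAllocator {
      virtual void *allocate(std::size_t) = 0;
      virtual void deallocate(void *) = 0;
      virtual ~IAllocator() = default;
    };

    // Example backing implementation (malloc here; RealmAllocatorImpl in the
    // patch keeps a ptr -> RegionInstance map instead).
    struct MallocAllocatorImpl final : public IAllocator {
      void *allocate(std::size_t size) override { return std::malloc(size); }
      void deallocate(void *ptr) override { std::free(ptr); }
    };

    // Type-erased handle: callers hold an Allocator by value and never see
    // the concrete implementation type.
    struct Allocator {
      template <typename T, typename... Args>
      static Allocator create(Args &&...args) {
        return Allocator{std::make_shared<T>(std::forward<Args>(args)...)};
      }
      void *allocate(std::size_t size) { return this->impl->allocate(size); }
      void deallocate(void *ptr) { this->impl->deallocate(ptr); }

    private:
      explicit Allocator(std::shared_ptr<IAllocator> impl)
          : impl(std::move(impl)) {}
      std::shared_ptr<IAllocator> impl;
    };

    int main() {
      Allocator a = Allocator::create<MallocAllocatorImpl>();
      void *p = a.allocate(64);
      a.deallocate(p);
      return 0;
    }
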
fadc7f5719..d7139210bc 100644
--- a/lib/realm-backend/src/realm_allocator.cc
+++ b/lib/realm-backend/src/realm_allocator.cc
@@ -15,40 +15,29 @@ RealmAllocatorImpl::RealmAllocatorImpl(Processor proc) : proc(proc) {
 }
 
 // TODO: now the region instance only corresponds to one tensor
-RealmRegion RealmAllocatorImpl::allocate(size_t requested_memory_size) {
+void *RealmAllocatorImpl::allocate(size_t requested_memory_size) {
   Rect<1> bounds(Point<1>(0), Point<1>(requested_memory_size - 1));
   RegionInstance requested_instance = RegionInstance::NO_INST;
   RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes,
                                   /*SOA*/ 1, ProfilingRequestSet())
       .wait();
   void *ptr = requested_instance.pointer_untyped(0, 0);
-  this->ptrs.insert({requested_instance, ptr});
-  return {requested_instance, this};
+  this->ptrs.insert({ptr, requested_instance});
+  return ptr;
 }
 
-void RealmAllocatorImpl::deallocate(RealmRegion region) {
-  if (region.allocator == this and contains_key(this->ptrs, region.instance)) {
-    RegionInstance instance = this->ptrs.at(region.instance);
-    instance.destroy();
+void RealmAllocatorImpl::deallocate(void *ptr) {
+  if (this->ptrs.count(ptr)) {
+    RegionInstance region = this->ptrs.at(ptr);
+    region.destroy();
+    this->ptrs.erase(ptr); // drop the stale mapping so the address can be reused
   } else {
     throw std::runtime_error(
        "Deallocating a pointer that was not allocated by this Allocator");
   }
 }
-
-/*********** RealmAllocator ***********/
-
-RealmRegion RealmAllocator::allocate(size_t mem_size) {
-  return this->i_allocator->allocate(mem_size);
-}
-
-void RealmAllocator::deallocate(RealmRegion region) {
-  this->i_allocator->deallocate(region);
-}
-
-RealmAllocator create_realm_memory_allocator(Processor proc) {
-  return RealmAllocator::create(proc);
+Allocator create_realm_memory_allocator(Processor proc) {
+  return Allocator::create<RealmAllocatorImpl>(proc);
 }
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/realm_tensor_backing copy.cc b/lib/realm-backend/src/realm_tensor_backing copy.cc
deleted file mode 100644
index bac16c6b69..0000000000
--- a/lib/realm-backend/src/realm_tensor_backing copy.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "task-spec/slot_grad_id.dtg.h"
-
-#include "op-attrs/parallel_tensor_shape.h"
-#include "pcg/computation_graph.h"
-#include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_allocator.h"
-#include "realm-backend/realm_tensor_backing.h"
-#include "utils/containers/contains_key.h"
-#include "utils/containers/keys.h"
-#include "utils/overload.h"
-
-namespace FlexFlow {
-
-RealmTensorBacking::RealmTensorBacking() {};
-
-void RealmTensorBacking::allocate_layer_tensors(
-    layer_guid_t const &layer_guid, ComputationGraph const &computation_graph,
-    RealmAllocator &allocator) {
-  this->allocate_tensors_by_role(TensorRole::INPUT, layer_guid,
-                                 computation_graph, allocator);
-  this->allocate_tensors_by_role(TensorRole::WEIGHT, layer_guid,
-                                 computation_graph, allocator);
-  this->allocate_tensors_by_role(TensorRole::OUTPUT, layer_guid,
-                                 computation_graph, allocator);
-}
-
-void RealmTensorBacking::allocate_tensors_by_role(
-    TensorRole const &role, layer_guid_t const &layer_guid,
-    ComputationGraph const &computation_graph, RealmAllocator &allocator) {
-  std::vector<tensor_guid_t> tensors;
-  switch (role) {
-    case TensorRole::INPUT:
-      tensors = get_incoming_inputs(computation_graph, layer_guid);
-      break;
-    case TensorRole::WEIGHT:
-      tensors = get_incoming_weights(computation_graph, layer_guid);
-      break;
-    case TensorRole::OUTPUT:
-      tensors = get_outgoing_tensors(computation_graph, layer_guid);
-      break;
-    default:
-      throw mk_runtime_error("Invalid tensor
role, got {}", role); - } - - for (tensor_guid_t const &tensor : tensors) { - TensorAttrs tensor_attrs = get_tensor_attrs(computation_graph, tensor); - // tensor allocation - if (!contains_key(this->tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = - allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } - - // gradient tensor allocation - if (tensor_attrs.create_gradients == CreateGrad::YES && - !contains_key(this->gradient_tensor_lowering_mapping, tensor)) { - lowered_tensor_t reduced_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->gradient_tensor_lowering_mapping.insert({tensor, reduced_tensor}); - RealmRegion region = - allocator.allocate(get_size_in_bytes(tensor_attrs.shape)); - this->tensor_regions.insert({reduced_tensor, region}); - this->tensor_shapes.insert({reduced_tensor, tensor_attrs.shape}); - } - } -} - -void RealmTensorBacking::allocate_optimizer_tensors( - tensor_guid_t const &weight, - std::vector const &optimizer_tensors, - RealmAllocator &allocator) { - GenericTensorAccessorW weight_backing = - this->get_tensor_backing(this->tensor_lowering_mapping.at(weight)); - for (optimizer_tensor_t const &optimizer_tensor : optimizer_tensors) { - // optimizer tensor allocation - if (!contains_key(this->optimizer_tensor_lowering_mapping, - optimizer_tensor)) { - lowered_tensor_t buffer_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - this->optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, buffer_tensor}); - TensorShape tensor_shape = - get_tensor_shape(weight_backing.shape, weight_backing.data_type); - RealmRegion region = allocator.allocate(get_size_in_bytes(tensor_shape)); - this->tensor_regions.insert({buffer_tensor, region}); - this->tensor_shapes.insert({buffer_tensor, tensor_shape}); - } - } -} - -bool RealmTensorBacking::is_tensor_allocated( - lowered_tensor_t const &tensor_id) const { - return contains_key(tensor_regions, tensor_id); -} - -GenericTensorAccessorW const &RealmTensorBacking::get_tensor_backing( - lowered_tensor_t const &tensor_id) const { - void *ptr = this->tensor_regions.at(tensor_id).instance.pointer_untyped(0, 0); - TensorShape shape = this->tensor_shapes.at(tensor_id); - return {shape.data_type, ArrayShape{shape}, ptr}; -} - -TensorSlotsBacking RealmTensorBacking::construct_tensor_slots_backing( - TaskBinding const &binding) const { - TensorSlotsBacking mapping; - - for (auto const &tensor_binding : binding.get_tensor_bindings()) { - SlotTensorTypeId slot_tensor_type_id = tensor_binding.first; - - lowered_tensor_t tensor_id = [&] { - TensorTypeVariant tensor_type = tensor_binding.second; - if (tensor_type.has() and - slot_tensor_type_id.tensor_type == TensorType::FORWARD) { - return this->tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has() and - slot_tensor_type_id.tensor_type == TensorType::GRADIENT) { - return this->gradient_tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has()) { - return this->optimizer_tensor_lowering_mapping.at( - tensor_type.get()); - } else if (tensor_type.has()) { - return this->loss_tensor_lowering_mapping.at( - tensor_type.get()); - } else { - throw mk_runtime_error(fmt::format("Tensor binding has invalid type")); - } - }(); - - 
GenericTensorAccessorW accessor = this->get_tensor_backing(tensor_id); - mapping.insert({slot_tensor_type_id, accessor}); - } - - return mapping; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 8f8f828821..12d0973fba 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -2,7 +2,7 @@ #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" -#include "realm-backend/allocated_tensors.h" +#include "local-execution/allocated_tensors.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" @@ -12,23 +12,16 @@ namespace FlexFlow { -GenericTensorAccessorW wrappup_tensor_accessor( - std::pair const &tensor_region_shape) { - void *ptr = tensor_region_shape.first.instance.pointer_untyped(0, 0); - TensorShape shape = tensor_region_shape.second; - return {shape.data_type, ArrayShape{shape}, ptr}; -} - RealmTensorBacking::RealmTensorBacking( AllocatedTensors const &allocated_tensors, UnallocatedTensors const &unallocated_tensors, - RealmAllocator const &allocator) + Allocator const &allocator) : tensor_gradient_mapping(allocated_tensors.gradient_mapping), tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), allocator(allocator) { // handle already-allocated tensors - for (std::pair> const + for (std::pair const &tensor_type_backing : allocated_tensors.tensor_type_backings) { lowered_tensor_t lowered_tensor = this->insert_tensor(tensor_type_backing.first); @@ -59,10 +52,9 @@ RealmTensorBacking::RealmTensorBacking( unallocated_tensors.tensor_type_shapes) { lowered_tensor_t lowered_tensor = this->insert_tensor(tensor_type_shape.first); - RealmRegion region = allocator.allocate( - get_size_in_bytes(tensor_type_shape.second).unwrap_nonnegative()); - this->tensor_backings.insert( - {lowered_tensor, {region, tensor_type_shape.second}}); + GenericTensorAccessorW tensor_backing = + this->allocator.allocate_tensor(tensor_type_shape.second); + this->tensor_backings.insert({lowered_tensor, tensor_backing}); } }; @@ -117,7 +109,7 @@ RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { throw mk_runtime_error( fmt::format("Unhandled tensor type {}", any_tensor)); }}); - return wrappup_tensor_accessor(this->tensor_backings.at(lowered_tensor)); + return this->tensor_backings.at(lowered_tensor); } UnallocatedTensors generate_unallocated_tensors( diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index f6b516e303..225a376cf3 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -23,26 +23,34 @@ namespace FlexFlow { using namespace Realm; RealmTrainingBacking::RealmTrainingBacking( - Processor master_proc, AllocatedTensors const &allocated_tensors, + Processor master_proc, std::vector const &worker_procs, + std::vector const &allocators, + AllocatedTensors const &allocated_tensors, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) - : computation_graph(computation_graph), + : master_proc(master_proc), worker_procs(worker_procs), + allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph)))) { - master_proc = master_proc; - proc_events.insert({master_proc, 
Realm::Event::NO_EVENT}); + get_layer_attrs_mapping(this->computation_graph))), + realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + this->allocators[0])), + realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) .best_affinity_to(master_proc) .first(); - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - for (Processor p : pq) { - worker_procs.push_back(p); - proc_events.insert({p, Realm::Event::NO_EVENT}); - allocators.push_back(RealmAllocator::create(p)); + for (Processor p : worker_procs) { + worker_events.push_back(Realm::Event::NO_EVENT); } - assert(worker_procs.size() > 0); + // Machine::ProcessorQuery pq = + // Machine::ProcessorQuery(Machine::get_machine()) + // .only_kind(Processor::TOC_PROC); + // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm for (layer_guid_t const &node : @@ -60,41 +68,35 @@ RealmTrainingBacking::RealmTrainingBacking( } } } - - // TODO: multi gpu - realm_tensor_backing = RealmTensorBacking( - allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - allocators[0]); - realm_args_backing = - initialize_args_backing(this->task_registry, this->computation_graph, - runtime_arg_config, this->realm_tensor_backing); } RealmTrainingBacking::RealmTrainingBacking( - Processor master_proc, AllocatedTensors const &allocated_tensors, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, - OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph)))) { - master_proc = master_proc; - proc_events.insert({master_proc, Realm::Event::NO_EVENT}); + Processor master_proc, std::vector const &worker_procs, + std::vector const &allocators, + AllocatedTensors const &allocated_tensors, + ComputationGraph const &computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) + : master_proc(master_proc), worker_procs(worker_procs), + allocators(allocators), computation_graph(computation_graph), + task_registry(construct_task_registry( + get_layer_attrs_mapping(this->computation_graph))), + realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + this->allocators[0])), + realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) .best_affinity_to(master_proc) .first(); - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - for (Processor p : pq) { - worker_procs.push_back(p); - proc_events.insert({p, Realm::Event::NO_EVENT}); - allocators.push_back(RealmAllocator::create(p)); + for (Processor p : worker_procs) { + worker_events.push_back(Realm::Event::NO_EVENT); } - 
assert(worker_procs.size() > 0); // register tasks for realm for (layer_guid_t const &node : @@ -112,16 +114,6 @@ RealmTrainingBacking::RealmTrainingBacking( } } } - - // TODO: multi gpu - realm_tensor_backing = RealmTensorBacking( - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, - optimizer_attrs), - allocators[0]); - realm_args_backing = initialize_args_backing(this, runtime_arg_config); } RealmArgsBacking @@ -140,7 +132,7 @@ initialize_args_backing(RealmTrainingBacking *backing, Processor master_proc = backing->master_proc; Memory master_mem = backing->master_mem; std::vector &worker_procs = backing->worker_procs; - std::unordered_map &proc_events = backing->proc_events; + std::vector &worker_events = backing->worker_events; for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, @@ -164,10 +156,10 @@ initialize_args_backing(RealmTrainingBacking *backing, Future future = promise.get_future(); RealmTaskArgs args{ task_id, impl_function, accessor, std::move(promise)}; - Event e = worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), - proc_events[worker_procs[0]]); - proc_events[worker_procs[0]] = e; + Event e = + worker_procs[0].spawn(static_cast(task_id), + &args, sizeof(args), worker_events[0]); + worker_events[0] = e; future.set_event(e); per_device_op_states.insert({node, std::move(future.get())}); } @@ -176,35 +168,6 @@ initialize_args_backing(RealmTrainingBacking *backing, return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } -// void RealmTrainingBacking::register_and_allocate_layer( -// layer_guid_t const &node) { -// ComputationGraphOpAttrs attrs = -// get_layer_attrs(this->computation_graph, node).attrs; -// this->realm_tensor_backing.allocate_layer_tensors( -// node, this->computation_graph, this->allocators[0]); -// } - -// void RealmTrainingBacking::allocate_layer_optimizer_tensors( -// layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { -// ComputationGraphOpAttrs attrs = -// get_layer_attrs(this->computation_graph, node).attrs; -// if (attrs.has()) { -// TaskSignature sig = get_update_signature(optimizer_attrs); -// tensor_guid_t weight_tensor = -// get_only(get_outgoing_tensors(this->computation_graph, node)); - -// std::vector optimizer_tensors; -// for (TensorTypeSlotSpec const &tensor_type_slot_spec : -// values(sig.tensor_guid_slots)) { -// optimizer_tensors.push_back( -// this->optimizer_tensor_source.new_optimizer_tensor()); -// } -// this->layer_optimizer_tensor_ids.insert({node, optimizer_tensors}); -// this->realm_tensor_backing.allocate_optimizer_tensors( -// weight_tensor, optimizer_tensors, this->allocators[0]); -// } -// } - Future> execute_forward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { @@ -242,10 +205,8 @@ execute_forward(RealmTrainingBacking &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -290,10 +251,8 @@ execute_backward(RealmTrainingBacking &realm_training_backing, 
std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -301,7 +260,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, } } -Future execute_update(RealmTrainingBacking const &realm_training_backing, +Future execute_update(RealmTrainingBacking &realm_training_backing, layer_guid_t const &node, OptimizerAttrs const &optimizer_attrs) { LayerAttrs layer_attrs = @@ -341,10 +300,8 @@ Future execute_update(RealmTrainingBacking const &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } else { @@ -352,7 +309,7 @@ Future execute_update(RealmTrainingBacking const &realm_training_backing, } } -Future compute_loss(RealmTrainingBacking const &realm_training_backing, +Future compute_loss(RealmTrainingBacking &realm_training_backing, LossAttrs const &loss_attrs, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor) { @@ -377,10 +334,8 @@ Future compute_loss(RealmTrainingBacking const &realm_training_backing, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), - realm_training_backing - .proc_events[realm_training_backing.worker_procs[0]]); - realm_training_backing.proc_events[realm_training_backing.worker_procs[0]] = - e; + realm_training_backing.worker_events[0]); + realm_training_backing.worker_events[0] = e; future.set_event(e); return future; } From 419cca873751ed93f9ba0887f87fa5798cad4539 Mon Sep 17 00:00:00 2001 From: fruitea Date: Mon, 3 Mar 2025 08:16:03 -0800 Subject: [PATCH 54/91] chore: eliminate std::optional --- .../include/realm-backend/realm_training_backing.h | 8 ++++---- lib/realm-backend/src/model_training_instance.cc | 8 ++++---- lib/realm-backend/src/task_wrapper.cc | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 45285464b8..1b756b14d3 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -55,10 +55,10 @@ RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, RuntimeArgConfig const &); void execute_init(RealmTrainingBacking &, layer_guid_t const &); -Future> execute_forward(RealmTrainingBacking &, - layer_guid_t const &); -Future> execute_backward(RealmTrainingBacking &, - layer_guid_t const &); +Future execute_forward(RealmTrainingBacking &, + layer_guid_t const &); +Future execute_backward(RealmTrainingBacking &, + layer_guid_t const &); Future compute_loss(RealmTrainingBacking &, LossAttrs const &, tensor_guid_t const &logit_tensor, loss_tensor_t const &label_tensor); diff --git a/lib/realm-backend/src/model_training_instance.cc 
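
Each spawn site above follows the same shape: allocate a Promise in master
memory, keep its Future, chain the task on the processor's previous event,
and block only in a later pass. A plain-C++ analogue of that two-phase
pattern (std::async standing in for Processor::spawn; the real code threads
Realm events through a custom Promise/Future) is:

    #include <future>
    #include <vector>

    // Stand-in for launching one layer's task; returns its "elapsed time".
    float run_layer(int layer_idx) {
      return 1.5f * static_cast<float>(layer_idx);
    }

    int main() {
      // pass 1: launch everything, collect futures without blocking
      std::vector<std::future<float>> pending;
      for (int layer = 0; layer < 4; ++layer) {
        pending.push_back(std::async(std::launch::async, run_layer, layer));
      }
      // pass 2: block on each result, mirroring the per-layer .get() loop
      float total = 0.0f;
      for (std::future<float> &f : pending) {
        total += f.get();
      }
      return total > 0.0f ? 0 : 1;
    }
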
b/lib/realm-backend/src/model_training_instance.cc index acb8edb314..aa8c30b34f 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -17,7 +17,7 @@ namespace FlexFlow { PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map>> + std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { @@ -26,7 +26,7 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { } for (layer_guid_t const &node : topological_ordering( model_training_instance.training_backing.computation_graph)) { - std::optional elapsed_time = + float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); } @@ -40,7 +40,7 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { model_training_instance.label_tensor); PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map>> + std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : reversed(topological_ordering( model_training_instance.training_backing.computation_graph))) { @@ -49,7 +49,7 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { } for (layer_guid_t const &node : reversed(topological_ordering( model_training_instance.training_backing.computation_graph))) { - std::optional elapsed_time = + float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); } diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index e58d2611af..ea36275462 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -16,12 +16,12 @@ void init_wrapper_task(const void *args, size_t arglen, const void *userdata, void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs> const &task_args = - *reinterpret_cast> *>(args); + RealmTaskArgs const &task_args = + *reinterpret_cast *>(args); auto fn = task_args.impl_function.get().function_ptr; std::optional result = fn(task_args.accessor); - task_args.promise.set_value(std::move(result)); + task_args.promise.set_value(result.has_value() ? 
result.value() : 0.0f); } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, From 2c0b5738e13c2671fa4b028c154cc5545f799220 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 4 Mar 2025 22:49:40 -0800 Subject: [PATCH 55/91] feat: buildable realm-backend --- .../include/realm-backend/task_result.h | 8 ++- .../src/realm_training_backing.cc | 62 +++++++++---------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index 4cf8916f85..bac20ddd14 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -47,17 +47,18 @@ template class Future { public: explicit Future(std::shared_ptr> state) : state_(std::move(state)) {} + explicit Future() = default; explicit Future(T value) : value_(std::move(value)) {} void set_event(Realm::Event e) { state_->set_event(e); } T get() { - value_ = state_->get_value(); - return value_; + value_ = std::make_optional(state_->get_value()); + return value_.value(); } void wait() { state_->wait(); } private: std::shared_ptr> state_; - T value_; + std::optional value_ = std::nullopt; }; // Specialization of Future for the `void` type, as it does not carry a value. @@ -67,6 +68,7 @@ template <> class Future { : state_(std::move(state)) {} explicit Future() = default; void set_event(Realm::Event e) { state_->set_event(e); } + void get() { state_->wait(); } void wait() { state_->wait(); } private: diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 225a376cf3..d0b985921e 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -53,19 +53,16 @@ RealmTrainingBacking::RealmTrainingBacking( // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - OpTaskInvocation op_task_invocation = attrs.get(); - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = - this->task_registry.task_mapping.at(task_id); + std::unordered_map const &layer_attrs_mapping = + get_layer_attrs_mapping(this->computation_graph); + for (std::pair const &layer_attrs : + layer_attrs_mapping) { + ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } } } } @@ -99,19 +96,16 @@ RealmTrainingBacking::RealmTrainingBacking( } // register tasks for realm - for (layer_guid_t const &node : - topological_ordering(this->computation_graph)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(this->computation_graph, node).attrs; - if (attrs.has()) { - OpTaskInvocation op_task_invocation = attrs.get(); - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = - this->task_registry.task_mapping.at(task_id); + std::unordered_map const &layer_attrs_mapping = + get_layer_attrs_mapping(this->computation_graph); + for (std::pair const &layer_attrs : + 
layer_attrs_mapping) { + ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + std::vector task_ids = get_task_ids(attrs); + for (task_id_t task_id : task_ids) { + TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } } } } @@ -168,7 +162,7 @@ initialize_args_backing(RealmTrainingBacking *backing, return RealmArgsBacking{runtime_arg_config, per_device_op_states}; } -Future> +Future execute_forward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(realm_training_backing.task_registry, @@ -199,10 +193,10 @@ execute_forward(RealmTrainingBacking &realm_training_backing, realm_training_backing.task_registry.task_mapping.at(task_id) .impl_function; // TODO: multi gpu launching - Promise> promise(realm_training_backing.master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{task_id, impl_function, accessor, - std::move(promise)}; + Promise promise(realm_training_backing.master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{task_id, impl_function, accessor, + std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); @@ -210,11 +204,11 @@ execute_forward(RealmTrainingBacking &realm_training_backing, future.set_event(e); return future; } else { - return Future>(std::nullopt); + return Future(0.0f); } } -Future> +Future execute_backward(RealmTrainingBacking &realm_training_backing, layer_guid_t const &operator_node) { if (registry_contains_task_for_layer(realm_training_backing.task_registry, @@ -245,10 +239,10 @@ execute_backward(RealmTrainingBacking &realm_training_backing, realm_training_backing.task_registry.task_mapping.at(task_id) .impl_function; // TODO: multi gpu launching - Promise> promise(realm_training_backing.master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{task_id, impl_function, accessor, - std::move(promise)}; + Promise promise(realm_training_backing.master_mem); + Future future = promise.get_future(); + RealmTaskArgs args{task_id, impl_function, accessor, + std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( static_cast(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); @@ -256,7 +250,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, future.set_event(e); return future; } else { - return Future>(std::nullopt); + return Future(0.0f); } } From 062825e7fd04b561b84a36374a2c8df24ef220dc Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 00:47:14 -0800 Subject: [PATCH 56/91] chore: Move realm tensor backing to dtgen --- .../realm-backend/model_training_instance.h | 9 +- .../realm_task_argument_accessor.h | 9 - .../realm-backend/realm_tensor_backing.h | 79 +++--- .../realm_tensor_backing.struct.toml | 34 +++ .../realm-backend/realm_training_backing.h | 6 +- .../src/model_training_instance.cc | 38 +-- .../src/realm_task_argument_accessor.cc | 31 --- lib/realm-backend/src/realm_tensor_backing.cc | 227 ++++-------------- .../src/realm_training_backing.cc | 49 ++-- 9 files changed, 172 insertions(+), 310 deletions(-) create mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 
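
A side note on the registration loops above: because they iterate per layer,
a task id shared by several layers is handed to register_wrapper_tasks more
than once. If that ever becomes a problem, a guard along these lines
(hypothetical register_task callback, not the real API) would keep
registration idempotent:

    #include <functional>
    #include <unordered_set>
    #include <vector>

    using task_id_t = int; // stand-in for the real task id type

    void register_unique_tasks(
        std::vector<std::vector<task_id_t>> const &task_ids_per_layer,
        std::function<void(task_id_t)> const &register_task) {
      std::unordered_set<task_id_t> seen;
      for (std::vector<task_id_t> const &layer_task_ids : task_ids_per_layer) {
        for (task_id_t task_id : layer_task_ids) {
          if (seen.insert(task_id).second) {
            register_task(task_id); // only the first occurrence registers
          }
        }
      }
    }
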
e30ae7a9a8..6c92b1de4a 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -23,11 +23,12 @@ struct ModelTrainingInstance { loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; -}; -PerLayerElapsedTime forward(ModelTrainingInstance &); -PerLayerElapsedTime backward(ModelTrainingInstance &); -void update(ModelTrainingInstance &); +public: + PerLayerElapsedTime forward(); + PerLayerElapsedTime backward(); + void update(); +}; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index d5c1a63b48..256e69c301 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -40,15 +40,6 @@ struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { ArgSlotsBacking arg_slots_backing; }; -using TensorSlotsBackingWithoutAddresses = std::unordered_map< - SlotTensorTypeId, - std::variant, - std::vector>>>; - -TensorSlotsBackingWithoutAddresses -get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &); - CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h index dac93c84b0..b38815ffee 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h @@ -3,68 +3,45 @@ #define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H #include "kernels/accessor.h" +#include "local-execution/allocated_tensors.dtg.h" #include "local-execution/gradient_tensor_source.h" #include "local-execution/loss_tensor_source.h" -#include "local-execution/lowered_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" +#include "local-execution/unallocated_tensors.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.dtg.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_task_argument_accessor.h" -#include "local-execution/unallocated_tensors.dtg.h" +#include "realm-backend/realm_tensor_backing.dtg.h" #include "task-spec/lowered_tensor_t.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/tensor_role.dtg.h" - namespace FlexFlow { -using TensorBackingMap = std::unordered_map; - -struct RealmTensorBacking { - RealmTensorBacking(AllocatedTensors const &, UnallocatedTensors const &, - Allocator const &); - -public: - GenericTensorAccessorW get_tensor(TensorTypeVariant const &) const; - -public: - // tensors - TensorBackingMap tensor_backings; - - std::unordered_map tensor_lowering_mapping; - std::unordered_map - gradient_tensor_lowering_mapping; - std::unordered_map - optimizer_tensor_lowering_mapping; - std::unordered_map - loss_tensor_lowering_mapping; - - std::unordered_map tensor_gradient_mapping; + GenericTensorAccessorW get_tensor(RealmTensorBacking const &, + TensorTypeVariant const &); + + std::unordered_map + get_tensor_backings( + std::unordered_map const &, + std::unordered_map const &, + Allocator &); + std::unordered_map> - tensor_optimizer_mapping; - - Allocator allocator; - -private: - lowered_tensor_t 
insert_tensor(TensorTypeVariant const &); - LoweredTensorSource lowered_tensor_source; -}; - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, OptimizerTensorSource &, OptimizerAttrs const &); - -TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, - TaskBinding const &); - -} // namespace FlexFlow - -#endif + merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated); + + RealmTensorBacking construct_realm_tensor_backing(AllocatedTensors const &, + UnallocatedTensors const &, + Allocator &); + + TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, + TaskBinding const &); + + } // namespace FlexFlow + + #endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml new file mode 100644 index 0000000000..92a074e4fc --- /dev/null +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "RealmTensorBacking" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/tensor_type_t.dtg.h", + "kernels/accessor.h", + "pcg/tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_t.dtg.h", + "task-spec/optimizer_tensor_t.dtg.h", +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_backings" +type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" + +[[fields]] +name = "tensor_gradient_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" + +[[fields]] +name = "tensor_optimizer_mapping" +type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 1b756b14d3..ee426324cb 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -6,7 +6,8 @@ #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.dtg.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/unallocated_tensors.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_args_backing.h" @@ -67,7 +68,8 @@ Future execute_update(RealmTrainingBacking &, layer_guid_t const &, TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, RealmArgsBacking const &, - TaskInvocation const &); + TaskInvocation const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index aa8c30b34f..8ced02e95a 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -15,17 +15,17 @@ namespace FlexFlow { optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), 
label_tensor(label_tensor){}; -PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { +PerLayerElapsedTime ModelTrainingInstance::forward() { PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_elapsed_time_future.insert( - {node, execute_forward(model_training_instance.training_backing, node)}); + {node, execute_forward(this->training_backing, node)}); } for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -33,22 +33,22 @@ PerLayerElapsedTime forward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { - compute_loss(model_training_instance.training_backing, - model_training_instance.loss_attrs, - model_training_instance.logit_tensor, - model_training_instance.label_tensor); +PerLayerElapsedTime ModelTrainingInstance::backward() { + compute_loss(this->training_backing, + this->loss_attrs, + this->logit_tensor, + this->label_tensor); PerLayerElapsedTime per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + this->training_backing.computation_graph))) { per_layer_elapsed_time_future.insert( - {node, execute_backward(model_training_instance.training_backing, node)}); + {node, execute_backward(this->training_backing, node)}); } for (layer_guid_t const &node : reversed(topological_ordering( - model_training_instance.training_backing.computation_graph))) { + this->training_backing.computation_graph))) { float elapsed_time = per_layer_elapsed_time_future[node].get(); per_layer_elapsed_time.insert({node, elapsed_time}); @@ -56,21 +56,21 @@ PerLayerElapsedTime backward(ModelTrainingInstance &model_training_instance) { return per_layer_elapsed_time; } -void update(ModelTrainingInstance &model_training_instance) { +void ModelTrainingInstance::update() { std::unordered_map> per_layer_update_future; for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_update_future.insert( - {node, execute_update(model_training_instance.training_backing, + {node, execute_update(this->training_backing, node, - model_training_instance.optimizer_attrs)}); + this->optimizer_attrs)}); } for (layer_guid_t const &node : topological_ordering( - model_training_instance.training_backing.computation_graph)) { + this->training_backing.computation_graph)) { per_layer_update_future[node].wait(); } - model_training_instance.optimizer_attrs = get_optimizer_attrs_for_next_iter( - model_training_instance.optimizer_attrs); + this->optimizer_attrs = get_optimizer_attrs_for_next_iter( + this->optimizer_attrs); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc index 7b27bad6c2..c7e81da01d 100644 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ 
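
update() above advances the optimizer attrs once per step through
get_optimizer_attrs_for_next_iter. For Adam-style attrs that step amounts to
decaying the running beta products and recomputing the bias-corrected
learning rate; a toy version with illustrative field names (not the real
OptimizerAttrs) looks like:

    #include <cmath>

    // Illustrative stand-in for Adam-style optimizer attrs.
    struct ToyAdamAttrs {
      double alpha;   // base learning rate
      double alpha_t; // bias-corrected learning rate for the current step
      double beta1_t; // running product of beta1 across iterations
      double beta2_t; // running product of beta2 across iterations
    };

    ToyAdamAttrs next_iter(ToyAdamAttrs const &a, double beta1, double beta2) {
      double beta1_t = a.beta1_t * beta1;
      double beta2_t = a.beta2_t * beta2;
      double alpha_t = a.alpha * std::sqrt(1 - beta2_t) / (1 - beta1_t);
      return ToyAdamAttrs{a.alpha, alpha_t, beta1_t, beta2_t};
    }
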
b/lib/realm-backend/src/realm_task_argument_accessor.cc @@ -57,37 +57,6 @@ Allocator RealmTaskArgumentAccessor::get_allocator() const { return this->allocator; } -TensorSlotsBackingWithoutAddresses - get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking const &slots_backing) { - - TensorSlotsBackingWithoutAddresses addressless_slots_backing; - - using TensorAccessorVariant = - std::variant>; - for (auto const &slot_tensor : slots_backing) { - TensorAccessorVariant accessor_variant = slot_tensor.second; - std::visit( - overload{ - [&](GenericTensorAccessorW const &accessor) { - addressless_slots_backing.insert( - {slot_tensor.first, get_shape_and_datatype(accessor)}); - }, - [&](std::vector const &variadic_accessor) { - std::vector> - variadic_addressless_accessor = - transform(variadic_accessor, - [](GenericTensorAccessorW const &accessor) { - return get_shape_and_datatype(accessor); - }); - addressless_slots_backing.insert( - {slot_tensor.first, variadic_addressless_accessor}); - }}, - accessor_variant); - } - return addressless_slots_backing; -} - size_t RealmTaskArgumentAccessor::get_device_idx() const { return 0; } diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc index 12d0973fba..5dcfa8cef8 100644 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ b/lib/realm-backend/src/realm_tensor_backing.cc @@ -1,9 +1,6 @@ #include "op-attrs/parallel_tensor_shape.h" -#include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" -#include "local-execution/allocated_tensors.h" -#include "realm-backend/realm_allocator.h" #include "realm-backend/realm_tensor_backing.h" #include "task-spec/slot_grad_id.dtg.h" #include "utils/containers/contains_key.h" @@ -12,190 +9,72 @@ namespace FlexFlow { -RealmTensorBacking::RealmTensorBacking( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator const &allocator) - : tensor_gradient_mapping(allocated_tensors.gradient_mapping), - tensor_optimizer_mapping(allocated_tensors.optimizer_mapping), - allocator(allocator) { - - // handle already-allocated tensors - for (std::pair const - &tensor_type_backing : allocated_tensors.tensor_type_backings) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_backing.first); - this->tensor_backings.insert({lowered_tensor, tensor_type_backing.second}); - } - - // allocate new tensors - this->tensor_gradient_mapping.insert( - unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); +GenericTensorAccessorW +get_tensor(RealmTensorBacking const &realm_tensor_backing, + TensorTypeVariant const &tensor_type) { + return realm_tensor_backing.tensor_backings.at(tensor_type); +} +std::unordered_map> +merge_optimizer_mappings( + std::unordered_map> const + &allocated, + std::unordered_map> const + &unallocated) { + std::unordered_map> + merged_maps = allocated; for (std::pair> const - &unallocated_optimizer_tensors : - unallocated_tensors.optimizer_mapping) { - if (this->tensor_optimizer_mapping.count( - unallocated_optimizer_tensors.first)) { + &unallocated_optimizer_tensors : unallocated) { + if (merged_maps.count(unallocated_optimizer_tensors.first)) { for (optimizer_tensor_t const &optimizer_tensor : unallocated_optimizer_tensors.second) { - this->tensor_optimizer_mapping[unallocated_optimizer_tensors.first] - .push_back(optimizer_tensor); + merged_maps[unallocated_optimizer_tensors.first].push_back( + 
optimizer_tensor); } } else { - this->tensor_optimizer_mapping.insert({unallocated_optimizer_tensors}); + merged_maps.insert({unallocated_optimizer_tensors}); } } - - for (std::pair const &tensor_type_shape : - unallocated_tensors.tensor_type_shapes) { - lowered_tensor_t lowered_tensor = - this->insert_tensor(tensor_type_shape.first); - GenericTensorAccessorW tensor_backing = - this->allocator.allocate_tensor(tensor_type_shape.second); - this->tensor_backings.insert({lowered_tensor, tensor_backing}); - } -}; - -lowered_tensor_t -RealmTensorBacking::insert_tensor(TensorTypeVariant const &tensor_type) { - lowered_tensor_t lowered_tensor = - this->lowered_tensor_source.new_lowered_tensor(); - tensor_type.visit(overload{ - [&](tensor_guid_t const &tensor_guid) { - this->tensor_lowering_mapping.insert({tensor_guid, lowered_tensor}); - return std::nullopt; - }, - [&](gradient_tensor_t const &gradient_tensor) { - this->gradient_tensor_lowering_mapping.insert( - {gradient_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - this->optimizer_tensor_lowering_mapping.insert( - {optimizer_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](loss_tensor_t const &loss_tensor) { - this->loss_tensor_lowering_mapping.insert( - {loss_tensor, lowered_tensor}); - return std::nullopt; - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); - return lowered_tensor; -} - -GenericTensorAccessorW -RealmTensorBacking::get_tensor(TensorTypeVariant const &tensor_type) const { - lowered_tensor_t lowered_tensor = - tensor_type.visit(overload{ - [&](tensor_guid_t const &tensor_guid) { - return this->tensor_lowering_mapping.at(tensor_guid); - }, - [&](gradient_tensor_t const &gradient_tensor) { - return this->gradient_tensor_lowering_mapping.at(gradient_tensor); - }, - [&](optimizer_tensor_t const &optimizer_tensor) { - return this->optimizer_tensor_lowering_mapping.at(optimizer_tensor); - }, - [&](loss_tensor_t const &loss_tensor) { - return this->loss_tensor_lowering_mapping.at(loss_tensor); - }, - [&](auto const &any_tensor) { - throw mk_runtime_error( - fmt::format("Unhandled tensor type {}", any_tensor)); - }}); - return this->tensor_backings.at(lowered_tensor); + return merged_maps; } -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source) { - - assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); - - std::unordered_map tensor_type_shapes; - std::unordered_map gradient_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; - if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { - tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); - } +std::unordered_map +get_tensor_backings( + std::unordered_map const + &tensor_type_backings, + std::unordered_map const + &tensor_type_shapes, + Allocator &allocator) { + std::unordered_map + all_tensor_backings = tensor_type_backings; - if (tensor_attrs.create_gradients == CreateGrad::YES && - !allocated_tensors.gradient_mapping.count(tensor_guid)) { - gradient_tensor_t gradient_tensor = - gradient_tensor_source.new_gradient_tensor(); - 
tensor_type_shapes.insert( - {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); - gradient_mapping.insert({tensor_guid, gradient_tensor}); - } + // allocate new tensors + for (std::pair const &tensor_type_shape : + tensor_type_shapes) { + GenericTensorAccessorW tensor_backing = + allocator.allocate_tensor(tensor_type_shape.second); + all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); } - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; + return all_tensor_backings; } -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - OptimizerAttrs const &optimizer_attrs) { - - UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - - if (!get_num_optimizer_tensors(optimizer_attrs)) { - return unallocated_tensors; - } - - std::unordered_map tensor_type_shapes = - unallocated_tensors.tensor_type_shapes; - std::unordered_map gradient_mapping = - unallocated_tensors.gradient_mapping; - std::unordered_map> - optimizer_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_gradients == CreateGrad::YES) { - std::vector optimizer_tensors; - - int num_optimizer_tensors_to_allocate = - get_num_optimizer_tensors(optimizer_attrs); - if (allocated_tensors.optimizer_mapping.count(tensor_guid)) { - num_optimizer_tensors_to_allocate -= - allocated_tensors.optimizer_mapping.at(tensor_guid).size(); - } - std::cout << num_optimizer_tensors_to_allocate; - - for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { - optimizer_tensor_t optimizer_tensor = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensors.push_back(optimizer_tensor); - tensor_type_shapes.insert( - {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); - } - - if (num_optimizer_tensors_to_allocate > 0) { - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); - } - } - } - - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, - optimizer_mapping}; +RealmTensorBacking +construct_realm_tensor_backing(AllocatedTensors const &allocated_tensors, + UnallocatedTensors const &unallocated_tensors, + Allocator &allocator) { + + std::unordered_map merged_gradient_maps = + allocated_tensors.gradient_mapping; + merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), + unallocated_tensors.gradient_mapping.end()); + + return RealmTensorBacking{ + get_tensor_backings(allocated_tensors.tensor_type_backings, + unallocated_tensors.tensor_type_shapes, allocator), + merged_gradient_maps, + merge_optimizer_mappings(allocated_tensors.optimizer_mapping, + unallocated_tensors.optimizer_mapping)}; } TensorSlotsBacking @@ -206,10 +85,10 @@ construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing, for (std::pair const &tensor_binding : binding.get_tensor_bindings()) { mapping.insert({tensor_binding.first, - realm_tensor_backing.get_tensor(tensor_binding.second)}); + get_tensor(realm_tensor_backing, tensor_binding.second)}); } return mapping; } -} // namespace FlexFlow +} // namespace FlexFlow \ No newline at end of file diff --git a/lib/realm-backend/src/realm_training_backing.cc 
b/lib/realm-backend/src/realm_training_backing.cc index d0b985921e..9da921d097 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -1,3 +1,4 @@ +#include "kernels/allocation.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" #include "local-execution/task_signature_impl.h" @@ -32,12 +33,12 @@ RealmTrainingBacking::RealmTrainingBacking( allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( get_layer_attrs_mapping(this->computation_graph))), - realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu - allocated_tensors, - generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), - this->allocators[0])), + realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source), + this->allocators[0])), realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -78,13 +79,13 @@ RealmTrainingBacking::RealmTrainingBacking( allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( get_layer_attrs_mapping(this->computation_graph))), - realm_tensor_backing(RealmTensorBacking( // TODO: multi gpu - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, - optimizer_attrs), - this->allocators[0])), + realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu + allocated_tensors, + generate_unallocated_tensors_with_optimizer( + allocated_tensors, get_all_tensor_attrs(this->computation_graph), + this->gradient_tensor_source, this->optimizer_tensor_source, + optimizer_attrs), + this->allocators[0])), realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) @@ -127,6 +128,8 @@ initialize_args_backing(RealmTrainingBacking *backing, Memory master_mem = backing->master_mem; std::vector &worker_procs = backing->worker_procs; std::vector &worker_events = backing->worker_events; + // TODO: multi gpu + Allocator &allocator = backing->allocators[0]; for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, @@ -141,7 +144,8 @@ initialize_args_backing(RealmTrainingBacking *backing, TaskArgumentAccessor accessor = get_task_arg_accessor( realm_tensor_backing, make_args_backing_with_empty_device_states(runtime_arg_config), - invocation); + invocation, + allocator); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = task_registry.task_mapping.at(task_id).impl_function; @@ -187,7 +191,8 @@ execute_forward(RealmTrainingBacking &realm_training_backing, device_state); TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = realm_training_backing.task_registry.task_mapping.at(task_id) @@ 
-233,7 +238,8 @@ execute_backward(RealmTrainingBacking &realm_training_backing, device_state); TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; TaskImplFunction impl_function = realm_training_backing.task_registry.task_mapping.at(task_id) @@ -282,7 +288,8 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, // execute update TaskArgumentAccessor accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, invocation); + realm_training_backing.realm_args_backing, invocation, + realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], task_id); @@ -316,7 +323,8 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); TaskArgumentAccessor loss_accessor = get_task_arg_accessor( realm_training_backing.realm_tensor_backing, - realm_training_backing.realm_args_backing, loss_invocation); + realm_training_backing.realm_args_backing, loss_invocation, + realm_training_backing.allocators[0]); task_id_t task_id = loss_invocation.task_id; register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], task_id); @@ -337,14 +345,15 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing, RealmArgsBacking const &realm_args_backing, - TaskInvocation const &invocation) { + TaskInvocation const &invocation, + Allocator &allocator) { TensorSlotsBacking tensor_slots_backing = construct_tensor_slots_backing(realm_tensor_backing, invocation.binding); ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( invocation.binding, realm_args_backing.runtime_arg_config); // TODO: multi gpu return TaskArgumentAccessor::create( - realm_tensor_backing.allocator, tensor_slots_backing, arg_slots_backing); + allocator, tensor_slots_backing, arg_slots_backing); } } // namespace FlexFlow From 7c53bb31a9f969d0ed72cc2bfbe3d9005be045c9 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 01:07:15 -0800 Subject: [PATCH 57/91] chore: minor --- .../realm-backend/realm_training_backing.h | 7 +++-- .../src/realm_training_backing.cc | 31 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index ee426324cb..c695dc1a46 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -24,12 +24,15 @@ struct RealmTrainingBacking { std::vector const &, std::vector const &, AllocatedTensors const &, + GradientTensorSource &, ComputationGraph const &, RuntimeArgConfig const &); RealmTrainingBacking(Realm::Processor, std::vector const &, std::vector const &, AllocatedTensors const &, + GradientTensorSource &, + OptimizerTensorSource &, ComputationGraph const &, RuntimeArgConfig const &, OptimizerAttrs const &); @@ -47,12 +50,10 @@ struct RealmTrainingBacking { ComputationGraph computation_graph; TaskRegistry task_registry; - - GradientTensorSource gradient_tensor_source; - 
OptimizerTensorSource optimizer_tensor_source; }; RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, + ComputationGraph const &, RuntimeArgConfig const &); void execute_init(RealmTrainingBacking &, layer_guid_t const &); diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 4c50548fa9..f03f788345 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -27,19 +27,20 @@ RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, std::vector const &allocators, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) : master_proc(master_proc), worker_procs(worker_procs), allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + get_layer_attrs_mapping(computation_graph))), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source), + allocated_tensors, get_all_tensor_attrs(computation_graph), + gradient_tensor_source), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) @@ -58,7 +59,7 @@ RealmTrainingBacking::RealmTrainingBacking( get_layer_attrs_mapping(this->computation_graph); for (std::pair const &layer_attrs : layer_attrs_mapping) { - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); @@ -72,21 +73,23 @@ RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, std::vector const &allocators, AllocatedTensors const &allocated_tensors, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) : master_proc(master_proc), worker_procs(worker_procs), allocators(allocators), computation_graph(computation_graph), task_registry(construct_task_registry( - get_layer_attrs_mapping(this->computation_graph))), + get_layer_attrs_mapping(computation_graph))), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors_with_optimizer( - allocated_tensors, get_all_tensor_attrs(this->computation_graph), - this->gradient_tensor_source, this->optimizer_tensor_source, + allocated_tensors, get_all_tensor_attrs(computation_graph), + gradient_tensor_source, optimizer_tensor_source, optimizer_attrs), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, runtime_arg_config)) { + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { master_event = Realm::Event::NO_EVENT; master_mem = Machine::MemoryQuery(Machine::get_machine()) .only_kind(Memory::SYSTEM_MEM) @@ 
-101,7 +104,7 @@ RealmTrainingBacking::RealmTrainingBacking( get_layer_attrs_mapping(this->computation_graph); for (std::pair const &layer_attrs : layer_attrs_mapping) { - ComputationGraphOpAttrs attrs = layer_attrs.second.attrs; + ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; std::vector task_ids = get_task_ids(attrs); for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); @@ -113,6 +116,7 @@ RealmTrainingBacking::RealmTrainingBacking( RealmArgsBacking initialize_args_backing(RealmTrainingBacking *backing, + ComputationGraph const &cg, RuntimeArgConfig const &runtime_arg_config) { // initialize_args_backing(TaskRegistry const &task_registry, // ComputationGraph const &cg, @@ -121,7 +125,6 @@ initialize_args_backing(RealmTrainingBacking *backing, std::unordered_map per_device_op_states; TaskRegistry const &task_registry = backing->task_registry; - ComputationGraph const &cg = backing->computation_graph; RealmTensorBacking const &realm_tensor_backing = backing->realm_tensor_backing; Processor master_proc = backing->master_proc; @@ -134,7 +137,7 @@ initialize_args_backing(RealmTrainingBacking *backing, for (layer_guid_t const &node : topological_ordering(cg)) { if (registry_contains_task_for_layer(task_registry, node, OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).attrs; + ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; TaskInvocation invocation = lower_to_task_invocation( init(attrs), node, get_incoming_inputs(cg, node), @@ -173,7 +176,7 @@ execute_forward(RealmTrainingBacking &realm_training_backing, operator_node, OpTaskType::FWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(realm_training_backing.computation_graph, operator_node) - .attrs; + .op_attrs; std::optional device_state = get_per_device_op_state_if_exists( realm_training_backing.realm_args_backing, operator_node); @@ -220,7 +223,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, operator_node, OpTaskType::BWD)) { ComputationGraphOpAttrs attrs = get_layer_attrs(realm_training_backing.computation_graph, operator_node) - .attrs; + .op_attrs; std::optional device_state = get_per_device_op_state_if_exists( realm_training_backing.realm_args_backing, operator_node); From bf57d1dfe2e6f4fbabeb2170f386bd399a080c7c Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 5 Mar 2025 01:17:51 -0800 Subject: [PATCH 58/91] chore: remove deprecated file --- lib/realm-backend/src/allocated_tensors.cc | 141 --------------------- 1 file changed, 141 deletions(-) delete mode 100644 lib/realm-backend/src/allocated_tensors.cc diff --git a/lib/realm-backend/src/allocated_tensors.cc b/lib/realm-backend/src/allocated_tensors.cc deleted file mode 100644 index 3e249bf6d1..0000000000 --- a/lib/realm-backend/src/allocated_tensors.cc +++ /dev/null @@ -1,141 +0,0 @@ -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/keys.h" -#include "utils/containers/set_union.h" - -namespace FlexFlow { - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &tensor_type, - std::unordered_map const - &allocated_tensor_backings, - ArrayShape const &expected_shape) { - if (allocated_tensor_backings.count(tensor_type)) { - GenericTensorAccessorW tensor_backing = - allocated_tensor_backings.at(tensor_type); - if (expected_shape == tensor_backing.shape) { - return true; - } - } - return false; -}; - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const 
&allocated_tensors, - std::unordered_map const &tensor_attrs) { - - std::unordered_set all_tensor_guids = transform( - keys(filter_keys( - allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { return k.has(); })), - [&](TensorTypeVariant const &t) { return t.get(); }); - - for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - if (tensor_attrs.count(tensor_guid)) { - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { - return false; - } - } else { - return false; - } - } - return true; -} - -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling gradient tensors - - for (std::pair const &tensor_to_grad : - allocated_tensors.gradient_mapping) { - if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_gradients == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); - } else { - return false; - } - } else { - return false; - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - return true; -} - -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling optimizer tensors - - for (std::pair> const - &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_gradients == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { - return false; - } - } - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - - return true; -} - -bool are_allocated_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) && - are_allocated_gradient_tensors_valid(allocated_tensors, - tensor_attrs) && - are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs); -} - -} // namespace FlexFlow From 3a0d4e85b2937de8ab8a97bfae688c4f8a0808ea Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 12 Mar 2025 12:02:49 -0700 Subject: [PATCH 59/91] feat: add a unit test for realm backend --- .proj.toml | 2 +- lib/realm-backend/CMakeLists.txt | 2 +- 
.../src/realm_training_backing.cc | 4 - lib/realm-backend/test/CMakeLists.txt | 8 +- .../test/modify_test_commands.cmake | 21 +++ lib/realm-backend/test/src/test_update.cc | 120 ++++++++++++++++++ lib/realm-backend/test/src/test_utils.cc | 19 +++ lib/realm-backend/test/src/test_utils.h | 23 ++++ 8 files changed, 192 insertions(+), 7 deletions(-) create mode 100644 lib/realm-backend/test/modify_test_commands.cmake create mode 100644 lib/realm-backend/test/src/test_update.cc create mode 100644 lib/realm-backend/test/src/test_utils.cc create mode 100644 lib/realm-backend/test/src/test_utils.h diff --git a/.proj.toml b/.proj.toml index 66caad7e4c..3581b3b2c7 100644 --- a/.proj.toml +++ b/.proj.toml @@ -28,7 +28,7 @@ test_targets = [ "compiler-tests", "substitution-generator-tests", "local-execution-tests", - #"realm-backend-tests", + "realm-backend-tests", "models-tests", ] diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 436d8cc8b0..623816567e 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -17,4 +17,4 @@ ff_add_library( legion ) -# add_subdirectory(test) +add_subdirectory(test) diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index f03f788345..17463ec4ec 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -49,10 +49,6 @@ RealmTrainingBacking::RealmTrainingBacking( for (Processor p : worker_procs) { worker_events.push_back(Realm::Event::NO_EVENT); } - // Machine::ProcessorQuery pq = - // Machine::ProcessorQuery(Machine::get_machine()) - // .only_kind(Processor::TOC_PROC); - // allocators.push_back(create_realm_memory_allocator(p)); // register tasks for realm std::unordered_map const &layer_attrs_mapping = diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt index 965f2e04b2..e180208fbc 100644 --- a/lib/realm-backend/test/CMakeLists.txt +++ b/lib/realm-backend/test/CMakeLists.txt @@ -6,9 +6,15 @@ ff_add_test_executable( PRIVATE_INCLUDE src/ DEPS - doctest utils-test-common realm-backend kernels op-attrs ) + +set(FF_TEST_EXEC_NAME "realm-backend-tests") +add_custom_command( + TARGET ${FF_TEST_EXEC_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake + DEPENDS ${FF_TEST_EXEC_NAME} +) diff --git a/lib/realm-backend/test/modify_test_commands.cmake b/lib/realm-backend/test/modify_test_commands.cmake new file mode 100644 index 0000000000..6494ae2d78 --- /dev/null +++ b/lib/realm-backend/test/modify_test_commands.cmake @@ -0,0 +1,21 @@ +# modify_test_commands.cmake + +file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") + +foreach(ctest_tests_file IN LISTS ctest_tests_files) + file(READ "${ctest_tests_file}" content) + + # add nix run prefix + string(REGEX REPLACE + "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" + "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" + content "${content}") + + # add environment + # string(REGEX REPLACE + # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" + # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" + # content "${content}") + + file(WRITE "${ctest_tests_file}" "${content}") +endforeach() diff --git a/lib/realm-backend/test/src/test_update.cc 
b/lib/realm-backend/test/src/test_update.cc new file mode 100644 index 0000000000..1023399c8a --- /dev/null +++ b/lib/realm-backend/test/src/test_update.cc @@ -0,0 +1,120 @@ +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "realm-backend/driver.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_training_backing.h" +#include "test_utils.h" + +using namespace ::FlexFlow; +using namespace Realm; + +void top_level_task(const void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p) { + // initialize runtime configs + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape input_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = + TensorShape{TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}}, + "weights"}, + {}, {}); + + LayerAddedResult linear_operator = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{ + LinearAttrs{output_dim, + /*use_bias=*/false, DataType::FLOAT, + Activation::RELU, std::nullopt}}, + "linear"}, + inputs_layer.outputs, weights_layer.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + int test_id = 0; + + { + printf("Running test %d: SGDOptimizerAttrs, momentum=0\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.0f, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } + + { + printf("Running test %d: SGDOptimizerAttrs, momentum=0.9\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, 
runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } + + { + printf("Running test %d: AdamOptimizerAttrs\n", ++test_id); + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, + /*beta1=*/0.9, + /*beta2=*/0.999, + /*weight_decay=*/0.001, + /*alpha_t=*/0.001, + /*beta_t=*/0.9, + /*beta2_t=*/0.999, + /*epsilon=*/1e-8}}; + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs); + } +} diff --git a/lib/realm-backend/test/src/test_utils.cc b/lib/realm-backend/test/src/test_utils.cc new file mode 100644 index 0000000000..b7a4e16b97 --- /dev/null +++ b/lib/realm-backend/test/src/test_utils.cc @@ -0,0 +1,19 @@ +#include "test_utils.h" +#include "pcg/tensor_guid_t.dtg.h" + +namespace FlexFlow { + +PerDeviceFFHandle get_mock_per_device_ff_handle() { + return {nullptr, nullptr, nullptr, 0, false}; +} + +size_t MockTensorGuidSource::next_available_mock_tensor_guid = 0; + +MockTensorGuidSource::MockTensorGuidSource() {} + +tensor_guid_t MockTensorGuidSource::new_mock_tensor_guid() { + size_t next_guid = MockTensorGuidSource::next_available_mock_tensor_guid++; + return tensor_guid_t{DataflowOutput{Node{0}, nonnegative_int{next_guid}}}; +} + +} // namespace FlexFlow diff --git a/lib/realm-backend/test/src/test_utils.h b/lib/realm-backend/test/src/test_utils.h new file mode 100644 index 0000000000..056e92687c --- /dev/null +++ b/lib/realm-backend/test/src/test_utils.h @@ -0,0 +1,23 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS +#define _FLEXFLOW_LOCAL_EXECUTION_TEST_UTILS + +#include "kernels/ff_handle.h" +#include "pcg/tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct MockTensorGuidSource { +public: + MockTensorGuidSource(); + + tensor_guid_t new_mock_tensor_guid(); + +private: + static size_t next_available_mock_tensor_guid; +}; + +PerDeviceFFHandle get_mock_per_device_ff_handle(); + +} // namespace FlexFlow + +#endif From fa3f9173b1148d3a0ba5b163e1c405f6e3bc7f59 Mon Sep 17 00:00:00 2001 From: fruitea Date: Sun, 16 Mar 2025 09:59:53 -0700 Subject: [PATCH 60/91] fix: DeviceSpecificState error --- .../include/realm-backend/task_result.h | 28 +++++++++++---- lib/realm-backend/src/driver.cc | 1 + .../src/realm_training_backing.cc | 8 ++--- lib/realm-backend/src/task_result.cc | 35 ------------------- lib/realm-backend/src/task_wrapper.cc | 7 ++-- 5 files changed, 30 insertions(+), 49 deletions(-) diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index bac20ddd14..cebaf8ccb6 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -19,11 +19,25 @@ template struct SharedState { Realm::RegionInstance inst; SharedState() = delete; - SharedState(Realm::Memory); - void set_event(Realm::Event); - void set_value(T &&); - void wait(); - T get_value(); + SharedState(Realm::Memory mem) { + Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0)); + this->inst = Realm::RegionInstance::NO_INST; + Realm::RegionInstance::create_instance( + this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1, + Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT) + .wait(); + } + void set_event(Realm::Event e) { this->event = e; } + 
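+  // Hypothetical usage sketch (illustrative, not part of this patch): the
+  // launching side holds the state and the spawned task fills it in:
+  //   SharedState<int> state(master_mem);  // master_mem: a SYSTEM_MEM Memory
+  //   state.set_event(spawn_event);        // event returned by Processor::spawn
+  //   /* inside the spawned task: */ state.set_value(42);
+  //   int v = state.get_value();           // blocks on spawn_event, then reads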
+  void set_value(T &&value) {
+    Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+    acc[Realm::Point<1>(0)] = std::move(value);
+  }
+  void wait() { this->event.wait(); }
+  T get_value() {
+    wait();
+    Realm::GenericAccessor<T, 1> acc(this->inst, 0);
+    return acc[Realm::Point<1>(0)];
+  }
 };
 
 // Specialization of SharedState for the `void` type, as it does not carry a
@@ -33,8 +47,8 @@ template <> struct SharedState<void> {
   Realm::Event event = Realm::Event::NO_EVENT;
 
   SharedState() = default;
-  void set_event(Realm::Event);
-  void wait();
+  void set_event(Realm::Event e) { this->event = e; }
+  void wait() { this->event.wait(); }
 };
 
 /**
diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc
index 8cfb038d97..3f02bf7098 100644
--- a/lib/realm-backend/src/driver.cc
+++ b/lib/realm-backend/src/driver.cc
@@ -18,6 +18,7 @@ int main(int argc, const char **argv) {
   Processor p = Machine::ProcessorQuery(Machine::get_machine())
                     .only_kind(Processor::LOC_PROC)
                     .first();
+  assert(p.exists());
   rt.shutdown(rt.collective_spawn(p, static_cast<Processor::TaskFuncID>(task_id_t::TOP_LEVEL_TASK_ID), 0, 0));
   return rt.wait_for_shutdown();
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 17463ec4ec..7879d63231 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -149,16 +149,16 @@ initialize_args_backing(RealmTrainingBacking *backing,
       TaskImplFunction impl_function =
           task_registry.task_mapping.at(task_id).impl_function;
       // TODO: multi gpu launching
-      Promise<DeviceSpecificDeviceStates> promise(master_mem);
-      Future<DeviceSpecificDeviceStates> future = promise.get_future();
-      RealmTaskArgs<DeviceSpecificDeviceStates> args{
+      Promise<std::optional<DeviceSpecificDeviceStates>> promise(master_mem);
+      Future<std::optional<DeviceSpecificDeviceStates>> future = promise.get_future();
+      RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> args{
           task_id, impl_function, accessor, std::move(promise)};
       Event e =
           worker_procs[0].spawn(static_cast<Processor::TaskFuncID>(task_id),
                                 &args, sizeof(args), worker_events[0]);
       worker_events[0] = e;
       future.set_event(e);
-      per_device_op_states.insert({node, std::move(future.get())});
+      per_device_op_states.insert({node, std::move(future.get().value())});
     }
   }
 
diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc
index 05aa1a8a9c..e69de29bb2 100644
--- a/lib/realm-backend/src/task_result.cc
+++ b/lib/realm-backend/src/task_result.cc
@@ -1,35 +0,0 @@
-#include "realm-backend/task_result.h"
-
-namespace FlexFlow {
-
-/************ SharedState implementation ************/
-template <typename T> SharedState<T>::SharedState(Realm::Memory mem) {
-  Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
-  this->inst = Realm::RegionInstance::NO_INST;
-  Realm::RegionInstance::create_instance(
-      this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
-      Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
-      .wait();
-}
-
-template <typename T> void SharedState<T>::set_event(Realm::Event e) {
-  this->event = e;
-}
-
-template <typename T> void SharedState<T>::set_value(T &&value) {
-  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-  acc[Realm::Point<1>(0)] = std::move(value);
-}
-
-template <typename T> void SharedState<T>::wait() { this->event.wait(); }
-
-template <typename T> T SharedState<T>::get_value() {
-  wait();
-  Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-  return acc[Realm::Point<1>(0)];
-}
-
-void SharedState<void>::set_event(Realm::Event e) { this->event = e; }
-
-void SharedState<void>::wait() { this->event.wait(); }
-} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index ea36275462..ca5ff4f4fd 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ 
b/lib/realm-backend/src/task_wrapper.cc @@ -1,4 +1,5 @@ #include "realm-backend/task_wrapper.h" +#include namespace FlexFlow { @@ -6,12 +7,12 @@ using namespace Realm; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + RealmTaskArgs> const &task_args = + *reinterpret_cast> *>(args); auto fn = task_args.impl_function.get().function_ptr; DeviceSpecificDeviceStates result = fn(task_args.accessor); - task_args.promise.set_value(std::move(result)); + task_args.promise.set_value(std::make_optional(result)); } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, From b55aed70135e9e9e9422f2ac6736e267f404e9da Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 01:58:40 -0700 Subject: [PATCH 61/91] fix: realm task id should start from `Processor::TASK_ID_FIRST_AVAILABLE` --- .proj.toml | 5 +++++ .../include/realm-backend/driver.h | 2 ++ lib/realm-backend/src/driver.cc | 22 ++++++++++++------- .../src/realm_training_backing.cc | 10 ++++----- lib/realm-backend/src/task_wrapper.cc | 6 ++--- lib/realm-backend/test/CMakeLists.txt | 5 +---- 6 files changed, 30 insertions(+), 20 deletions(-) diff --git a/.proj.toml b/.proj.toml index a06fb53c3a..6b2909ef2a 100644 --- a/.proj.toml +++ b/.proj.toml @@ -43,6 +43,11 @@ type = "lib" tests = true benchmarks = false +[targets.realm-backend] +type = "lib" +tests = false +benchmarks = false + [targets.models] type = "lib" tests = true diff --git a/lib/realm-backend/include/realm-backend/driver.h b/lib/realm-backend/include/realm-backend/driver.h index 884b97a23d..d4b373099b 100644 --- a/lib/realm-backend/include/realm-backend/driver.h +++ b/lib/realm-backend/include/realm-backend/driver.h @@ -5,6 +5,8 @@ #include "realm/cmdline.h" #include "task-spec/op_task_invocation.h" +Realm::Processor::TaskFuncID get_realm_task_id(FlexFlow::task_id_t task_id); + void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); diff --git a/lib/realm-backend/src/driver.cc b/lib/realm-backend/src/driver.cc index 3f02bf7098..e656836c10 100644 --- a/lib/realm-backend/src/driver.cc +++ b/lib/realm-backend/src/driver.cc @@ -3,16 +3,19 @@ using namespace Realm; using namespace FlexFlow; -Logger log_app("app"); +Processor::TaskFuncID get_realm_task_id(task_id_t task_id) { + return static_cast(task_id) + + Processor::TASK_ID_FIRST_AVAILABLE; +} -int main(int argc, const char **argv) { +int main(int argc, char **argv) { Runtime rt; - rt.init(&argc, (char ***)&argv); + rt.init(&argc, &argv); - Processor::register_task_by_kind(Processor::LOC_PROC, false /*!global*/, - static_cast(task_id_t::TOP_LEVEL_TASK_ID), - CodeDescriptor(top_level_task), - ProfilingRequestSet()) + Processor::register_task_by_kind( + Processor::LOC_PROC, false /*!global*/, + get_realm_task_id(task_id_t::TOP_LEVEL_TASK_ID), + CodeDescriptor(top_level_task), ProfilingRequestSet()) .external_wait(); Processor p = Machine::ProcessorQuery(Machine::get_machine()) @@ -20,6 +23,9 @@ int main(int argc, const char **argv) { .first(); assert(p.exists()); - rt.shutdown(rt.collective_spawn(p, static_cast(task_id_t::TOP_LEVEL_TASK_ID), 0, 0)); + Event e = rt.collective_spawn( + p, get_realm_task_id(task_id_t::TOP_LEVEL_TASK_ID), 0, 0); + rt.shutdown(e); + return rt.wait_for_shutdown(); } diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 7879d63231..d1a25b2788 
100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -154,7 +154,7 @@ initialize_args_backing(RealmTrainingBacking *backing, RealmTaskArgs> args{ task_id, impl_function, accessor, std::move(promise)}; Event e = - worker_procs[0].spawn(static_cast(task_id), + worker_procs[0].spawn(get_realm_task_id(task_id), &args, sizeof(args), worker_events[0]); worker_events[0] = e; future.set_event(e); @@ -202,7 +202,7 @@ execute_forward(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, impl_function, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -249,7 +249,7 @@ execute_backward(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, impl_function, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -299,7 +299,7 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, update_impl_fn, accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -334,7 +334,7 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, std::move(promise)}; Event e = realm_training_backing.worker_procs[0].spawn( - static_cast(task_id), &args, sizeof(args), + get_realm_task_id(task_id), &args, sizeof(args), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index ca5ff4f4fd..e628f40ad8 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -36,21 +36,21 @@ void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, void register_wrapper_tasks_init(Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) .external_wait(); } void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet()) .external_wait(); } void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { Processor::register_task_by_kind( - p.kind(), false /*!global*/, static_cast(task_id), + p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(generic_wrapper_task), ProfilingRequestSet()) .external_wait(); } diff --git a/lib/realm-backend/test/CMakeLists.txt b/lib/realm-backend/test/CMakeLists.txt index e180208fbc..6658784d9e 100644 --- a/lib/realm-backend/test/CMakeLists.txt +++ 
b/lib/realm-backend/test/CMakeLists.txt @@ -1,4 +1,4 @@ -ff_add_test_executable( +ff_add_executable( NAME realm-backend-tests SRC_PATTERNS @@ -6,10 +6,7 @@ ff_add_test_executable( PRIVATE_INCLUDE src/ DEPS - utils-test-common realm-backend - kernels - op-attrs ) set(FF_TEST_EXEC_NAME "realm-backend-tests") From a921775f8b3916b103829adee9e9390ba7f74452 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 05:24:46 -0700 Subject: [PATCH 62/91] fix: RealmTrainingBacking initialization --- .../realm-backend/realm_training_backing.h | 9 ++- .../src/realm_training_backing.cc | 74 ++++++++----------- lib/realm-backend/src/task_result.cc | 0 3 files changed, 36 insertions(+), 47 deletions(-) delete mode 100644 lib/realm-backend/src/task_result.cc diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index c695dc1a46..8fe842daf6 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -45,13 +45,16 @@ struct RealmTrainingBacking { std::vector worker_events; std::vector allocators; - RealmTensorBacking realm_tensor_backing; - RealmArgsBacking realm_args_backing; - ComputationGraph computation_graph; TaskRegistry task_registry; + + RealmTensorBacking realm_tensor_backing; + RealmArgsBacking realm_args_backing; }; +TaskRegistry construct_task_registry_and_register_tasks_for_realm( + ComputationGraph const &, std::vector const &); + RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, ComputationGraph const &, RuntimeArgConfig const &); diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index d1a25b2788..ee46105b31 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -30,40 +30,24 @@ RealmTrainingBacking::RealmTrainingBacking( GradientTensorSource &gradient_tensor_source, ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config) - : master_proc(master_proc), worker_procs(worker_procs), + : master_proc(master_proc), master_event(Realm::Event::NO_EVENT), + master_mem(Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first()), + worker_procs(worker_procs), + worker_events(std::vector(worker_procs.size(), + Realm::Event::NO_EVENT)), allocators(allocators), computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(computation_graph))), + task_registry(construct_task_registry_and_register_tasks_for_realm( + computation_graph, worker_procs)), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors( allocated_tensors, get_all_tensor_attrs(computation_graph), gradient_tensor_source), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { - master_event = Realm::Event::NO_EVENT; - master_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::SYSTEM_MEM) - .best_affinity_to(master_proc) - .first(); - for (Processor p : worker_procs) { - worker_events.push_back(Realm::Event::NO_EVENT); - } - - // register tasks for realm - std::unordered_map const &layer_attrs_mapping = - get_layer_attrs_mapping(this->computation_graph); - for (std::pair const &layer_attrs : - layer_attrs_mapping) { - ComputationGraphOpAttrs 
attrs = layer_attrs.second.op_attrs; - std::vector task_ids = get_task_ids(attrs); - for (task_id_t task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); - // TODO: multi gpu - register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); - } - } -} + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {} RealmTrainingBacking::RealmTrainingBacking( Processor master_proc, std::vector const &worker_procs, @@ -74,10 +58,17 @@ RealmTrainingBacking::RealmTrainingBacking( ComputationGraph const &computation_graph, RuntimeArgConfig const &runtime_arg_config, OptimizerAttrs const &optimizer_attrs) - : master_proc(master_proc), worker_procs(worker_procs), + : master_proc(master_proc), master_event(Realm::Event::NO_EVENT), + master_mem(Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::SYSTEM_MEM) + .best_affinity_to(master_proc) + .first()), + worker_procs(worker_procs), + worker_events(std::vector(worker_procs.size(), + Realm::Event::NO_EVENT)), allocators(allocators), computation_graph(computation_graph), - task_registry(construct_task_registry( - get_layer_attrs_mapping(computation_graph))), + task_registry(construct_task_registry_and_register_tasks_for_realm( + computation_graph, worker_procs)), realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu allocated_tensors, generate_unallocated_tensors_with_optimizer( @@ -85,19 +76,16 @@ RealmTrainingBacking::RealmTrainingBacking( gradient_tensor_source, optimizer_tensor_source, optimizer_attrs), this->allocators[0])), - realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) { - master_event = Realm::Event::NO_EVENT; - master_mem = Machine::MemoryQuery(Machine::get_machine()) - .only_kind(Memory::SYSTEM_MEM) - .best_affinity_to(master_proc) - .first(); - for (Processor p : worker_procs) { - worker_events.push_back(Realm::Event::NO_EVENT); - } + realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {} + +TaskRegistry construct_task_registry_and_register_tasks_for_realm( + ComputationGraph const &cg, std::vector const &worker_procs) { + TaskRegistry task_registry = construct_task_registry( + get_layer_attrs_mapping(cg)); // register tasks for realm std::unordered_map const &layer_attrs_mapping = - get_layer_attrs_mapping(this->computation_graph); + get_layer_attrs_mapping(cg); for (std::pair const &layer_attrs : layer_attrs_mapping) { ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; @@ -108,16 +96,14 @@ RealmTrainingBacking::RealmTrainingBacking( register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); } } + + return task_registry; } RealmArgsBacking initialize_args_backing(RealmTrainingBacking *backing, ComputationGraph const &cg, RuntimeArgConfig const &runtime_arg_config) { - // initialize_args_backing(TaskRegistry const &task_registry, - // ComputationGraph const &cg, - // RuntimeArgConfig const &runtime_arg_config, - // RealmTensorBacking const &realm_tensor_backing) { std::unordered_map per_device_op_states; TaskRegistry const &task_registry = backing->task_registry; diff --git a/lib/realm-backend/src/task_result.cc b/lib/realm-backend/src/task_result.cc deleted file mode 100644 index e69de29bb2..0000000000 From a708496e92216e53804e6aa9f82588996c779fc3 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 19 Mar 2025 10:31:30 -0700 Subject: [PATCH 63/91] fix: bugs with DeviceSpecificDeviceStates... 
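A likely culprit: the Promise/Future pair previously shared its state
through a std::shared_ptr, but RealmTaskArgs is byte-copied into Realm's
task-argument buffer, so the refcounted pointer is not reliable on the
receiving side. The diff below instead stores SharedState by value,
backed by a Realm RegionInstance. A rough sketch of the intended round
trip, assuming a single worker processor (payload type and variable
names are illustrative):

    Promise<float> promise(master_mem);
    Future<float> future = promise.get_future();
    RealmTaskArgs<float> args{task_id, impl_fn, accessor, std::move(promise)};
    Realm::Event e = worker_procs[0].spawn(get_realm_task_id(task_id),
                                           &args, sizeof(args), prev_event);
    future.set_event(e);          // the task itself calls promise.set_value(...)
    float result = future.get();  // waits on e, then reads the RegionInstance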
---
 .../include/realm-backend/task_result.h       | 51 ++++++++++---------
 .../include/realm-backend/task_wrapper.h      |  2 +-
 lib/realm-backend/src/realm_args_backing.cc   |  9 ----
 lib/realm-backend/src/task_wrapper.cc         | 14 ++++-
 4 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h
index bac20ddd14..19cd91f104 100644
--- a/lib/realm-backend/include/realm-backend/task_result.h
+++ b/lib/realm-backend/include/realm-backend/task_result.h
@@ -3,7 +3,7 @@
 
 #include "realm-backend/driver.h"
 #include
-#include
+#include
 
 namespace FlexFlow {
 
@@ -16,27 +16,30 @@ template <typename T> struct SharedState {
   // synchronization primitives
   Realm::Event event = Realm::Event::NO_EVENT;
   // where the result is stored
-  Realm::RegionInstance inst;
+  Realm::RegionInstance inst = Realm::RegionInstance::NO_INST;
 
-  SharedState() = delete;
+  SharedState() = default;
   SharedState(Realm::Memory mem) {
     Realm::Rect<1> bounds(Realm::Point<1>(0), Realm::Point<1>(0));
-    this->inst = Realm::RegionInstance::NO_INST;
     Realm::RegionInstance::create_instance(
         this->inst, mem, bounds, {sizeof(T)}, /*SOA*/ 1,
         Realm::ProfilingRequestSet(), Realm::Event::NO_EVENT)
         .wait();
   }
   void set_event(Realm::Event e) { this->event = e; }
-  void set_value(T &&value) {
+  void set_value(T &&value) const {
+    assert(this->inst.exists());
     Realm::GenericAccessor<T, 1> acc(this->inst, 0);
     acc[Realm::Point<1>(0)] = std::move(value);
   }
   void wait() { this->event.wait(); }
   T get_value() {
     wait();
+    assert(this->inst.exists());
     Realm::GenericAccessor<T, 1> acc(this->inst, 0);
-    return acc[Realm::Point<1>(0)];
+    T value = acc[Realm::Point<1>(0)];
+    this->inst.destroy();
+    return value;
   }
 };
 
@@ -59,34 +62,34 @@ template <> struct SharedState<void> {
  */
 template <typename T> class Future {
 public:
-  explicit Future(std::shared_ptr<SharedState<T>> state)
-      : state_(std::move(state)) {}
+  explicit Future(SharedState<T> state) : state_(state) {}
   explicit Future() = default;
   explicit Future(T value) : value_(std::move(value)) {}
-  void set_event(Realm::Event e) { state_->set_event(e); }
+  void set_event(Realm::Event e) { state_.set_event(e); }
   T get() {
-    value_ = std::make_optional(state_->get_value());
+    if (!value_.has_value()) {
+      value_ = std::make_optional(state_.get_value());
+    }
    return value_.value();
   }
-  void wait() { state_->wait(); }
+  void wait() { state_.wait(); }
 
 private:
-  std::shared_ptr<SharedState<T>> state_;
-  std::optional<T> value_ = std::nullopt;
+  SharedState<T> state_;
+  std::optional<T> value_;
 };
 
 // Specialization of Future for the `void` type, as it does not carry a value.
template <> class Future<void> {
 public:
-  explicit Future(std::shared_ptr<SharedState<void>> state)
-      : state_(std::move(state)) {}
+  explicit Future(SharedState<void> state) : state_(state) {}
   explicit Future() = default;
-  void set_event(Realm::Event e) { state_->set_event(e); }
-  void get() { state_->wait(); }
-  void wait() { state_->wait(); }
+  void set_event(Realm::Event e) { state_.set_event(e); }
+  void get() { state_.wait(); }
+  void wait() { state_.wait(); }
 
 private:
-  std::shared_ptr<SharedState<void>> state_;
+  SharedState<void> state_;
 };
 
 /**
@@ -97,22 +100,22 @@ template <> class Future<void> {
 template <typename T> class Promise {
 public:
   Promise() = delete;
-  Promise(Realm::Memory mem) : state_(std::make_shared<SharedState<T>>(mem)) {}
+  Promise(Realm::Memory mem) : state_(SharedState<T>(mem)) {}
   Future<T> get_future() { return Future<T>(state_); }
-  void set_value(T &&value) const { state_->set_value(std::move(value)); }
+  void set_value(T &&value) const { state_.set_value(std::move(value)); }
 
 private:
-  std::shared_ptr<SharedState<T>> state_;
+  SharedState<T> state_;
 };
 
 // Specialization of Promise for the `void` type, as it does not carry a value.
 template <> class Promise<void> {
 public:
-  Promise() : state_(std::make_shared<SharedState<void>>()) {}
+  Promise() : state_(SharedState<void>()) {}
   Future<void> get_future() { return Future<void>(state_); }
 
 private:
-  std::shared_ptr<SharedState<void>> state_;
+  SharedState<void> state_;
 };
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h
index 89521becf4..8265ca398b 100644
--- a/lib/realm-backend/include/realm-backend/task_wrapper.h
+++ b/lib/realm-backend/include/realm-backend/task_wrapper.h
@@ -31,7 +31,7 @@ void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id);
 
 void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id);
 
-void register_wrapper_tasks(Realm::Processor p, task_id_t task_id,
+void register_wrapper_tasks(int pid, Realm::Processor p, task_id_t task_id,
                             TaskSignatureAndImpl task_sig_impl);
 
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc
index e20fcdc14d..d30793a801 100644
--- a/lib/realm-backend/src/realm_args_backing.cc
+++ b/lib/realm-backend/src/realm_args_backing.cc
@@ -7,15 +7,6 @@
 
 namespace FlexFlow {
 
-// void RealmArgsBacking::add_per_device_op_state(
-//     layer_guid_t const &op_guid, Future<DeviceSpecificDeviceStates> &&future)
-// {
-//   if (per_device_op_states.find(op_guid) != per_device_op_states.end()) {
-//     throw mk_runtime_error("Op state already exists");
-//   }
-//   per_device_op_states.insert({op_guid, std::move(future)});
-// }
-
 RealmArgsBacking make_args_backing_with_empty_device_states(
     RuntimeArgConfig const &runtime_arg_config) {
   return RealmArgsBacking{runtime_arg_config, {}};
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index e628f40ad8..7894a90672 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ b/lib/realm-backend/src/task_wrapper.cc
@@ -1,10 +1,16 @@
 #include "realm-backend/task_wrapper.h"
 #include
+#include
 
 namespace FlexFlow {
 
 using namespace Realm;
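+// Descriptive note: the set added below makes register_wrapper_tasks
+// idempotent per (processor id, task id) pair. Each scenario in
+// test_update.cc constructs its own RealmTrainingBacking, and without this
+// guard the same wrapper task id would be registered with Realm repeatedly.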
+// After the device-specific states come back from an init task, store a copy
+// here to keep them from being destroyed prematurely.
+std::vector<DeviceSpecificDeviceStates> device_state_storage;
+
+std::unordered_set<std::pair<int, task_id_t>> registered_tasks;
+
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p) {
   RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const &task_args =
@@ -12,6 +18,7 @@ void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
   auto fn = task_args.impl_function.get().function_ptr;
 
   DeviceSpecificDeviceStates result = fn(task_args.accessor);
+  device_state_storage.push_back(result);
   task_args.promise.set_value(std::make_optional(result));
 }
 
@@ -55,8 +62,13 @@ void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) {
       .external_wait();
 }
 
-void register_wrapper_tasks(Processor p, task_id_t task_id,
+void register_wrapper_tasks(int p_id, Processor p, task_id_t task_id,
                             TaskSignatureAndImpl task_sig_impl) {
+  std::pair<int, task_id_t> key = {p_id, task_id};
+  if (registered_tasks.find(key) != registered_tasks.end()) {
+    return;
+  }
+  registered_tasks.insert(key);
   switch (task_sig_impl.task_signature.type) {
   case OpTaskType::INIT:
     register_wrapper_tasks_init(p, task_id);

From 6e9c9af605fa73e82f190aca8de6c17a163a082f Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Mar 2025 10:39:17 -0700
Subject: [PATCH 64/91] tests: pass test_update

---
 .../include/realm-backend/task_result.h       | 39 +++++++++++++++++++
 .../src/realm_training_backing.cc             | 10 ++---
 lib/realm-backend/src/task_wrapper.cc         |  9 ++---
 lib/realm-backend/test/src/test_update.cc     | 16 +++++---
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h
index 19cd91f104..d869982563 100644
--- a/lib/realm-backend/include/realm-backend/task_result.h
+++ b/lib/realm-backend/include/realm-backend/task_result.h
@@ -2,6 +2,7 @@
 #define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H
 
 #include "realm-backend/driver.h"
+#include "realm-backend/realm_task_argument_accessor.h"
 #include
 #include
 
@@ -92,6 +93,24 @@ template <> class Future<void> {
   SharedState<void> state_;
 };
 
+template <> class Future<DeviceSpecificDeviceStates> {
+public:
+  explicit Future(
+      std::shared_ptr<std::optional<DeviceSpecificDeviceStates>> value)
+      : value_(value) {}
+  Future() = delete;
+  void set_event(Realm::Event e) { event_ = e; }
+  std::optional<DeviceSpecificDeviceStates> get() {
+    wait();
+    return *value_;
+  }
+  void wait() { event_.wait(); }
+
+private:
+  Realm::Event event_;
+  std::shared_ptr<std::optional<DeviceSpecificDeviceStates>> value_;
+};
+
 /**
  * @brief Promise class template that allows setting a result in a SharedState
  * object. It is used to fulfill a Future with a value, and provides methods to
@@ -118,6 +137,26 @@ template <> class Promise<void> {
   SharedState<void> state_;
 };
 
+// Specialization of Promise for the `DeviceSpecificDeviceStates` type. It has
+// an inner shared_ptr value, so we need a way to keep that value from being
+// destroyed early. A `shared_ptr` works here because the DeviceState stays on
+// the same node as the device that launched the init task. The value is
+// wrapped in a std::optional because the concrete size of
+// DeviceSpecificDeviceStates is not known up front.
+template <> class Promise { +public: + Promise() + : value_(std::make_shared>()) {} + void set_value(DeviceSpecificDeviceStates value) const { + *value_ = std::make_optional(value); + } + Future get_future() { + return Future(value_); + } + +private: + std::shared_ptr> value_; +}; + } // namespace FlexFlow #endif \ No newline at end of file diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index ee46105b31..3b7eb48823 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -93,7 +93,7 @@ TaskRegistry construct_task_registry_and_register_tasks_for_realm( for (task_id_t task_id : task_ids) { TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); // TODO: multi gpu - register_wrapper_tasks(worker_procs[0], task_id, task_signature_impl); + register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl); } } @@ -135,16 +135,16 @@ initialize_args_backing(RealmTrainingBacking *backing, TaskImplFunction impl_function = task_registry.task_mapping.at(task_id).impl_function; // TODO: multi gpu launching - Promise> promise(master_mem); - Future> future = promise.get_future(); - RealmTaskArgs> args{ + Promise promise = Promise(); + Future future = promise.get_future(); + RealmTaskArgs args{ task_id, impl_function, accessor, std::move(promise)}; Event e = worker_procs[0].spawn(get_realm_task_id(task_id), &args, sizeof(args), worker_events[0]); worker_events[0] = e; future.set_event(e); - per_device_op_states.insert({node, std::move(future.get().value())}); + per_device_op_states.insert({node, future.get().value()}); } } diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 7894a90672..f07f11b60d 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -6,20 +6,17 @@ namespace FlexFlow { using namespace Realm; -// After get device specific states from init task, storage a copy here to avoid auto destruction. 
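// A minimal, Realm-free sketch of the lifetime pattern the patches above
// converge on: the future and the promise share ownership of the result, so
// the value outlives the producing task without a global storage vector. The
// Mini* names below are illustrative assumptions, not APIs from this tree.
#include <condition_variable>
#include <memory>
#include <mutex>
#include <optional>

template <typename T> struct MiniSharedState {
  std::mutex mutex;
  std::condition_variable cv;
  std::optional<T> value; // set exactly once by the producer
};

template <typename T> class MiniFuture {
public:
  explicit MiniFuture(std::shared_ptr<MiniSharedState<T>> s) : s_(std::move(s)) {}
  T get() {
    std::unique_lock<std::mutex> lock(s_->mutex);
    s_->cv.wait(lock, [&] { return s_->value.has_value(); });
    return *s_->value;
  }

private:
  // The shared_ptr keeps the state alive even after the producer finishes,
  // which is what the removed global `device_state_storage` was papering over.
  std::shared_ptr<MiniSharedState<T>> s_;
};

template <typename T> class MiniPromise {
public:
  MiniPromise() : s_(std::make_shared<MiniSharedState<T>>()) {}
  MiniFuture<T> get_future() { return MiniFuture<T>(s_); }
  void set_value(T v) {
    {
      std::lock_guard<std::mutex> lock(s_->mutex);
      s_->value = std::move(v);
    }
    s_->cv.notify_all();
  }

private:
  std::shared_ptr<MiniSharedState<T>> s_;
};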
-std::vector<DeviceSpecificDeviceStates> device_state_storage;
-
 std::unordered_set<std::pair<int, task_id_t>> registered_tasks;
 
 void init_wrapper_task(const void *args, size_t arglen, const void *userdata,
                        size_t userlen, Processor p) {
-  RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const &task_args =
-      *reinterpret_cast<RealmTaskArgs<std::optional<DeviceSpecificDeviceStates>> const *>(args);
+  RealmTaskArgs<DeviceSpecificDeviceStates> const &task_args =
+      *reinterpret_cast<RealmTaskArgs<DeviceSpecificDeviceStates> const *>(args);
   auto fn = task_args.impl_function.get().function_ptr;
 
   DeviceSpecificDeviceStates result = fn(task_args.accessor);
-  device_state_storage.push_back(result);
-  task_args.promise.set_value(std::make_optional(result));
+  task_args.promise.set_value(result);
 }
 
 void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata,
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
index 1023399c8a..77462e2588 100644
--- a/lib/realm-backend/test/src/test_update.cc
+++ b/lib/realm-backend/test/src/test_update.cc
@@ -21,6 +21,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   std::vector<Allocator> allocators;
   Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
                                    .only_kind(Processor::TOC_PROC);
+  assert(pq.count() > 0);
   for (Processor p : pq) {
     worker_procs.push_back(p);
     allocators.push_back(create_realm_memory_allocator(p));
@@ -73,7 +74,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   int test_id = 0;
 
   {
-    printf("Running test %d: SGDOptimizerAttrs, momentum=0\n", ++test_id);
+    printf("Running test %d: SGDOptimizerAttrs, momentum=0...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
                                          /*momentum=*/0.0f,
@@ -83,11 +84,12 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
   }
 
   {
-    printf("Running test %d: SGDOptimizerAttrs, momentum=0.9\n", ++test_id);
+    printf("Running test %d: SGDOptimizerAttrs, momentum=0.9...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
                                          /*momentum=*/0.9,
@@ -97,11 +99,12 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
  }
 
   {
-    printf("Running test %d: AdamOptimizerAttrs\n", ++test_id);
+    printf("Running test %d: AdamOptimizerAttrs...", ++test_id);
     OptimizerAttrs optimizer_attrs =
         OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001,
                                           /*beta1=*/0.9,
@@ -115,6 +118,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
         p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
         optimizer_tensor_source, computation_graph, runtime_arg_config,
         optimizer_attrs);
-    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs);
+    execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait();
+    printf("passed\n");
   }
 }
From 7b1f653198c63cba94d987213a72189ab0e882bf Mon Sep 17 00:00:00 2001
From: fruitea
Date: Wed, 19 Mar 2025 10:41:08 -0700
Subject: [PATCH
65/91] chore: minor --- lib/realm-backend/test/src/test_update.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc index 77462e2588..0b332d1ccc 100644 --- a/lib/realm-backend/test/src/test_update.cc +++ b/lib/realm-backend/test/src/test_update.cc @@ -74,7 +74,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, int test_id = 0; { - printf("Running test %d: SGDOptimizerAttrs, momentum=0...", ++test_id); + printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.0f, @@ -89,7 +89,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, } { - printf("Running test %d: SGDOptimizerAttrs, momentum=0.9...", ++test_id); + printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0.9...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, @@ -104,7 +104,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, } { - printf("Running test %d: AdamOptimizerAttrs...", ++test_id); + printf("\nRunning test %d: AdamOptimizerAttrs...\n", ++test_id); OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, /*beta1=*/0.9, From 64a82b3478cc4c1da841f84335d0f426f9eb0a2d Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 11:15:55 -0700 Subject: [PATCH 66/91] Add e2e test --- .../local-execution/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/local-execution/test/src/test_e2e.cc | 140 ++++++++++++++++++ .../test/src/test_local_cost_estimator.cc | 4 +- .../test/src/test_loss_functions.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- 7 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 lib/local-execution/test/src/test_e2e.cc diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index b36b20ed04..54b76313ab 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,6 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d404221d88..f232011230 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,4 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc new file mode 100644 index 0000000000..3f3a7ed0bc --- /dev/null +++ 
b/lib/local-execution/test/src/test_e2e.cc @@ -0,0 +1,140 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "local-execution/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("E2ETest") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + LocalTrainingBacking local_training_backing = + 
LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + } +} diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 30682c9a48..0fa841be20 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,8 +9,8 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Cost Estimator") { +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { // local backing initialization ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 2bf138e204..ae76dcccf9 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -14,7 +14,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Loss Functions") { + TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index dd4b6f5b44..16877b0e09 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Task Registry") { + TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; nonnegative_int embed_dim = 32_n; diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 1f8684f38a..dcd9c025b3 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Execute Update") { + TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; From ffd96e2c4a341da4268384c79ffaef1eca889f70 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 11:17:09 -0700 Subject: [PATCH 67/91] Format --- .../src/model_training_instance.cc | 4 +- lib/local-execution/test/src/test_e2e.cc | 61 ++++++++++--------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f232011230..96a324b492 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -56,8 +56,8 @@ void ModelTrainingInstance::update() { GenericTensorAccessorW 
ModelTrainingInstance::get_loss_tensor_backing() { gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 3f3a7ed0bc..33ffbe5f96 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -3,22 +3,24 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" +#include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" #include "utils/containers/get_only.h" -#include "local-execution/model_training_instance.h" #include using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, + GenericTensorAccessorW const &last_epoch) { + float *first_epoch_ptr = first_epoch.get_float_ptr(); + float *last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = + first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); for (int i = 0; i < batch_size; i++) { if (first_epoch_ptr[i] < last_epoch_ptr[i]) { return false; @@ -28,7 +30,6 @@ bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorA return true; } - TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime @@ -39,8 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { // allocate label tensors LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; @@ -53,11 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -99,33 +95,37 @@ TEST_SUITE(FF_TEST_SUITE) { ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, - 
allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = + ModelTrainingInstance{allocator, + local_training_backing, + logit_tensor, + label_tensor, + loss_attrs, + optimizer_attrs}; int num_epochs = 10; - std::vector loss_values (num_epochs); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); @@ -133,8 +133,9 @@ TEST_SUITE(FF_TEST_SUITE) { model_training_instance.update(); loss_values[i] = model_training_instance.get_loss_tensor_backing(); } - - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); } } From 2f75451059455612aa716eb53e38c888396ca85a Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 15:32:05 -0700 Subject: [PATCH 68/91] Pass cost estimator test --- .../include/local-execution/task_argument_accessor.h | 8 +++++++- lib/local-execution/src/local_cost_estimator.cc | 12 ++++++------ lib/local-execution/src/local_training_backing.cc | 7 ++++--- .../test/src/test_local_cost_estimator.cc | 2 +- .../src/per_device_op_state.cc} | 0 5 files changed, 18 insertions(+), 11 deletions(-) rename lib/{local-execution/src/per_device_state.cc => task-spec/src/per_device_op_state.cc} (100%) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 99c1c1296b..285b41991a 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -14,7 +14,13 @@ struct TaskArgumentAccessor { if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { PerDeviceOpState device_states = this->ptr->get_concrete_arg(slot).get(); - return device_states.get(); + if (device_states.has()) { + return device_states.get(); + } else { + throw mk_runtime_error( + fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); + } } else { return this->ptr->get_concrete_arg(slot).get(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 532fcc91c2..0ee6c9a987 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,12 +90,12 @@ CostDetails LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); + + float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + std::cout << 
"completed forward" << std::endl; + float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index b2e0a2fb7e..7d916715f5 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,8 +104,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -116,13 +115,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 0fa841be20..e493265f86 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*kdim=*/embed_dim, /*vdim=*/embed_dim, /*dropout=*/0.0, - /*bias=*/true, + /*bias=*/false, /*add_bias_kv=*/false, /*add_zero_attn=*/false, }; diff --git a/lib/local-execution/src/per_device_state.cc b/lib/task-spec/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/task-spec/src/per_device_op_state.cc From 2746e141ae3dd9f4fac8f03ddfaea9fb781b5b44 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 5 May 2025 05:33:42 -0700 Subject: [PATCH 69/91] Add nccl fix and host accessor access --- lib/kernels/include/kernels/accessor.h | 1 + .../kernels/managed_per_device_ff_handle.h | 6 +- lib/kernels/src/accessor.cc | 10 +++ .../src/managed_per_device_ff_handle.cc | 22 +++++- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- .../local-execution/model_training_instance.h | 2 +- .../local-execution/task_argument_accessor.h | 6 +- .../src/local-execution/ops/linear.cc | 6 +- .../src/local_cost_estimator.cc | 11 ++- .../src/local_training_backing.cc | 9 ++- 
.../src/model_training_instance.cc | 4 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/test/src/test_e2e.cc | 75 +++++++++++++------ .../test/src/test_local_cost_estimator.cc | 2 +- .../test/src/test_loss_functions.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 14 ++++ 35 files changed, 146 insertions(+), 66 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..55b120b090 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -75,6 +75,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); float *get_float_ptr(GenericTensorAccessorW const &); +void write_to_host_float_ptr(GenericTensorAccessorW const &, float *); double *get_double_ptr(GenericTensorAccessorW const &); half *get_half_ptr(GenericTensorAccessorW const &); std::vector diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..05e8406de8 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle(int num_ranks, int my_rank); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -25,6 +25,10 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; +ManagedPerDeviceFFHandle initialize_single_gpu_handle(); +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..7f4f61c271 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,4 +1,5 @@ #include "kernels/accessor.h" +#include "device.h" namespace FlexFlow { @@ -76,6 +77,15 @@ float *get_float_ptr(GenericTensorAccessorW const &a) { return get(a); } +void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { + float *device_ptr = get(a); + int total_elements = get_volume(a.shape).unwrap_nonnegative(); + checkCUDA(cudaMemcpy(host_ptr, + device_ptr, + total_elements * sizeof(float), + cudaMemcpyDeviceToHost)); +} + double *get_double_ptr(GenericTensorAccessorW const &a) { return get(a); } diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..e327a7b1e1 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,9 +1,10 @@ #include "kernels/managed_per_device_ff_handle.h" #include "device.h" +#include "kernels/nccl.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle(int num_ranks, int my_rank) { handle = new PerDeviceFFHandle; handle->workSpaceSize = 1024 * 1024; handle->allowTensorOpMathConversion = true; @@ -11,6 +12,13 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { checkCUDNN(cudnnCreate(&handle->dnn)); checkCUBLAS(cublasCreate(&handle->blas)); checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + +#ifdef FF_USE_NCCL + ncclUniqueId 
ncclId; + checkNCCL(ncclGetUniqueId(&ncclId)); + checkNCCL(ncclCommInitRank( + &handle->ncclComm, num_ranks, ncclId, my_rank)); // todo generalize +#endif } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -28,6 +36,9 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUDNN(cudnnDestroy(handle->dnn)); checkCUBLAS(cublasDestroy(handle->blas)); checkCUDA(cudaFree(handle->workSpace)); +#ifdef FF_USE_NCCL + checkNCCL(ncclCommDestroy(handle->ncclComm)); +#endif delete handle; } } @@ -36,4 +47,13 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +ManagedPerDeviceFFHandle initialize_single_gpu_handle() { + return ManagedPerDeviceFFHandle(1, 0); +} + +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank) { + return ManagedPerDeviceFFHandle(num_ranks, my_rank); +} + } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..a15497984c 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..b9cfbf3ec5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..94ce268b93 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..68f35cb099 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..ca6b95dadc 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int size_per_input = 100_n; ff_dim_t concat_axis = ff_dim_t{0_n}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); 
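// A minimal standalone sketch of the single-rank NCCL bootstrap that the
// initialize_single_gpu_handle() calls being threaded through these tests
// wrap. It assumes CUDA device 0 and the stock NCCL API; the helper name
// make_single_rank_comm is an illustrative assumption, and error checking
// is elided.
#include <cuda_runtime.h>
#include <nccl.h>

ncclComm_t make_single_rank_comm() {
  ncclUniqueId id;
  ncclGetUniqueId(&id); // with one rank there is no other process to broadcast the id to
  ncclComm_t comm;
  cudaSetDevice(0);
  ncclCommInitRank(&comm, /*nranks=*/1, id, /*rank=*/0);
  return comm; // caller releases it later with ncclCommDestroy(comm)
}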
ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..7e78544df8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..c9e1778843 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..ffe8e0dfd2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..9e89c86433 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_float_tensor_shape_from_legion_dims({feature_size}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..281a146a30 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..874e2b8d98 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc 
b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..7f993c12d3 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {10_n, 10_n, 10_n, 10_n, 10_n}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..8c47c2a49a 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..1e969f6d82 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..ba808c491a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..cba293aed1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..65d1ed7783 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc 
b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..f7007d76e4 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 54b76313ab..2deed6b0a2 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 285b41991a..499b5ff7d6 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -17,9 +17,9 @@ struct TaskArgumentAccessor { if (device_states.has()) { return device_states.get(); } else { - throw mk_runtime_error( - fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); + throw mk_runtime_error(fmt::format( + "Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); } } else { return this->ptr->get_concrete_arg(slot).get(); diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc index 94f92d37ee..768293b32f 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/local-execution/src/local-execution/ops/linear.cc @@ -89,7 +89,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -102,6 +101,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } @@ -118,14 +118,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size.unwrap_nonnegative()); } -; - static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -137,6 +134,7 @@ static std::optional float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0ee6c9a987..0a84c19066 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,11 +90,14 @@ CostDetails 
LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - - float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed forward" << std::endl; - float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 7d916715f5..d508c34210 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,7 +104,8 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = task_sig_impl.impl_function.get().function_ptr; + auto fn = + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -115,15 +116,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { - + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 96a324b492..e58b5dfe7d 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,14 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b9ce83d14..1b8fc37b2d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -70,7 +70,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { int size = weight_grad.shape.get_volume().unwrap_nonnegative(); assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative() == 0); + weight.shape.get_volume().unwrap_nonnegative()); int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 
33ffbe5f96..5791a94cbb 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -14,19 +14,12 @@ using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, - GenericTensorAccessorW const &last_epoch) { - float *first_epoch_ptr = first_epoch.get_float_ptr(); - float *last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = - first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } @@ -34,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); @@ -44,7 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, @@ -62,32 +56,55 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); + add_input_layer_with_grad(computation_graph, input_tensor_shape); - LayerAddedResult weights_layer = add_layer( + LayerAddedResult weights_layer_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, std::nullopt}, {}, {}); - LayerAddedResult linear_operator = add_layer( + LayerAddedResult weights_layer_2 = add_layer( computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -124,18 +141,28 @@ TEST_SUITE(FF_TEST_SUITE) { loss_attrs, optimizer_attrs}; - int num_epochs = 10; - std::vector loss_values(num_epochs); + int num_epochs = 5; + int 
num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); + float *host_loss_ptr = new float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + CHECK(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; + } } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index e493265f86..c9c5afe04e 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { // local backing initialization - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ae76dcccf9..ca2482653b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index dcd9c025b3..75ba517d1b 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index efc955ec92..60e825c11a 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -24,6 +24,8 @@ LayerAddedResult add_layer( LayerAddedResult add_input_layer(ComputationGraph &computation_graph, TensorShape const &tensor_shape); +LayerAddedResult add_input_layer_with_grad(ComputationGraph &computation_graph, + TensorShape const &tensor_shape); TensorAttrs get_tensor_attrs(ComputationGraph const &, tensor_guid_t const &); bool are_tensor_guid_shapes_equivalent(ComputationGraph const &cg, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 200410dd7b..b8917eed35 100644 --- 
a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -100,6 +100,20 @@ LayerAddedResult add_input_layer(ComputationGraph &cg, /*outputs=*/std::vector{CreateGrad::NO}); } +LayerAddedResult add_input_layer_with_grad(ComputationGraph &cg, + TensorShape const &tensor_shape) { + LayerAttrs layer_attrs = LayerAttrs{ + /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, + }; + + return add_layer(cg, + layer_attrs, + /*inputs=*/{}, + /*weights=*/{}, + /*outputs=*/std::vector{CreateGrad::YES}); +} + TensorAttrs get_tensor_attrs(ComputationGraph const &cg, tensor_guid_t const &t) { return cg.raw_graph.at(t.raw_graph_output); From 31df7223cae1ef0d59ac2a0ba07444d0795d0c2f Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Thu, 8 May 2025 02:49:42 -0700 Subject: [PATCH 70/91] Move operators into task-spec --- .proj.toml | 9 +- .../include/kernels/optimizer_kernels.h | 3 - lib/kernels/src/accessor.cc | 202 ------------------ lib/kernels/src/allocation.cc | 21 -- lib/kernels/src/kernels/accessor.cc | 17 ++ .../src/managed_per_device_ff_handle.cc | 20 +- .../allocated_tensors.struct.toml | 1 - .../local_task_argument_accessor.h | 2 +- .../local_tensor_backing.struct.toml | 3 - .../include/local-execution/loss_functions.h | 2 +- .../local-execution/loss_tensor_source.h | 3 +- .../include/local-execution/optimizer.h | 2 +- .../local-execution/task_registry.struct.toml | 2 +- lib/local-execution/src/allocated_tensors.cc | 6 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_tensor_source.cc | 2 +- lib/local-execution/src/task_registry.cc | 2 +- lib/local-execution/test/CMakeLists.txt | 7 +- .../test/modify_test_commands.cmake | 21 -- lib/local-execution/test/src/test_e2e.cc | 11 +- .../test/src/test_local_task_arg_accessor.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/pcg/include/pcg/metric_attrs.h | 2 +- lib/pcg/src/pcg/metric_attrs.cc | 4 +- .../fwd_bwd_op_task_impl_function.h | 6 +- .../task-spec}/generic_task_impl_function.h | 6 +- .../task-spec}/init_op_task_impl_function.h | 6 +- .../task-spec}/itask_argument_accessor.h | 6 +- .../task-spec/loss_tensor_t.struct.toml | 5 +- .../include/task-spec}/ops/attention.h | 6 +- .../include/task-spec}/ops/batch_matmul.h | 6 +- .../include/task-spec}/ops/batch_norm.h | 2 +- .../include/task-spec}/ops/cast.h | 2 +- .../include/task-spec}/ops/combine.h | 2 +- .../include/task-spec}/ops/concat.h | 2 +- .../include/task-spec}/ops/conv_2d.h | 2 +- .../include/task-spec}/ops/dropout.h | 2 +- .../include/task-spec}/ops/element_binary.h | 8 +- .../include/task-spec}/ops/element_unary.h | 2 +- .../include/task-spec}/ops/embedding.h | 2 +- .../include/task-spec}/ops/flat.h | 2 +- .../include/task-spec}/ops/gather.h | 2 +- .../include/task-spec}/ops/input.h | 0 .../include/task-spec}/ops/layer_norm.h | 2 +- .../include/task-spec}/ops/linear.h | 2 +- .../include/task-spec}/ops/noop.h | 0 .../include/task-spec}/ops/parallel_op.h | 0 .../include/task-spec}/ops/pool_2d.h | 2 +- .../include/task-spec}/ops/reduce.h | 2 +- .../include/task-spec}/ops/reduction.h | 2 +- .../include/task-spec}/ops/repartition.h | 2 +- .../include/task-spec}/ops/replicate.h | 2 +- .../include/task-spec}/ops/reshape.h | 2 +- .../include/task-spec}/ops/reverse.h | 2 +- .../include/task-spec}/ops/softmax.h | 6 +- .../include/task-spec}/ops/split.h | 2 +- .../include/task-spec}/ops/topk.h | 2 +- .../include/task-spec}/ops/transpose.h | 2 +- .../include/task-spec}/ops/weight.h | 0 
.../include/task-spec}/permissions.h | 4 +- .../task-spec}/privilege_tensor_accessor.h | 6 +- .../task-spec}/task_argument_accessor.h | 6 +- .../task_impl_function.variant.toml | 6 +- .../include/task-spec}/task_signature_impl.h | 6 +- .../task_signature_impl.struct.toml | 2 +- .../src/{ => task-spec}/concrete_arg.cc | 0 .../fwd_bwd_op_task_impl_function.cc | 2 +- .../task-spec}/generic_task_impl_function.cc | 2 +- .../task-spec}/init_op_task_impl_function.cc | 2 +- .../src/task-spec/itask_argument_accessor.cc | 1 + .../src/{ => task-spec}/op_arg_ref.cc | 0 .../src/{ => task-spec}/op_arg_spec.cc | 0 .../src/{ => task-spec}/op_task_invocation.cc | 0 .../src/{ => task-spec}/op_task_signature.cc | 0 .../op_task_to_task_invocation.cc | 0 .../src/{ => task-spec}/op_tensor_spec.cc | 0 .../src/task-spec}/ops/attention.cc | 2 +- .../src/task-spec}/ops/batch_matmul.cc | 2 +- .../src/task-spec}/ops/batch_norm.cc | 2 +- .../src/task-spec}/ops/cast.cc | 2 +- .../src/task-spec}/ops/combine.cc | 2 +- .../src/task-spec}/ops/concat.cc | 2 +- .../src/task-spec}/ops/conv_2d.cc | 2 +- .../src/task-spec}/ops/dropout.cc | 2 +- .../src/task-spec}/ops/element_binary.cc | 4 +- .../src/task-spec}/ops/element_unary.cc | 2 +- .../src/task-spec}/ops/flat.cc | 2 +- .../src/task-spec}/ops/gather.cc | 2 +- .../src/task-spec}/ops/input.cc | 2 +- .../src/task-spec}/ops/layer_norm.cc | 2 +- .../src/task-spec}/ops/linear.cc | 4 +- .../src/task-spec}/ops/noop.cc | 2 +- .../src/task-spec}/ops/pool_2d.cc | 3 +- .../src/task-spec}/ops/reduce.cc | 3 +- .../src/task-spec}/ops/reduction.cc | 2 +- .../src/task-spec}/ops/repartition.cc | 2 +- .../src/task-spec}/ops/replicate.cc | 2 +- .../src/task-spec}/ops/reshape.cc | 2 +- .../src/task-spec}/ops/reverse.cc | 2 +- .../src/task-spec}/ops/softmax.cc | 2 +- .../src/task-spec}/ops/split.cc | 2 +- .../src/task-spec}/ops/topk.cc | 2 +- .../src/task-spec}/ops/transpose.cc | 2 +- .../src/task-spec}/ops/weight.cc | 2 +- .../{ => task-spec}/per_device_op_state.cc | 0 .../src/task-spec}/permissions.cc | 2 +- .../task-spec/privilege_tensor_accessor.cc | 1 + .../src/{ => task-spec}/runtime_arg_ref.cc | 0 .../src/{ => task-spec}/task_arg_spec.cc | 0 .../src/task-spec/task_argument_accessor.cc | 1 + .../src/{ => task-spec}/task_invocation.cc | 0 .../src/{ => task-spec}/task_signature.cc | 0 .../src/task-spec}/task_signature_impl.cc | 60 +++--- .../{ => task-spec}/variadic_tensor_ref.cc | 0 lib/task-spec/test/CMakeLists.txt | 14 ++ lib/task-spec/test/src/task-spec/arg_ref.cc | 33 +++ 116 files changed, 247 insertions(+), 410 deletions(-) delete mode 100644 lib/kernels/src/accessor.cc delete mode 100644 lib/kernels/src/allocation.cc delete mode 100644 lib/local-execution/test/modify_test_commands.cmake rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/fwd_bwd_op_task_impl_function.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/generic_task_impl_function.h (80%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/init_op_task_impl_function.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/itask_argument_accessor.h (82%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/attention.h (79%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/batch_matmul.h (75%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/batch_norm.h (93%) rename 
lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/cast.h (95%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/combine.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/concat.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/conv_2d.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/dropout.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/element_binary.h (73%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/element_unary.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/embedding.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/flat.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/gather.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/input.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/layer_norm.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/linear.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/noop.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/parallel_op.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/pool_2d.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reduce.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reduction.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/repartition.h (93%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/replicate.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reshape.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/reverse.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/softmax.h (78%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/split.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/topk.h (92%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/transpose.h (91%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/ops/weight.h (100%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/permissions.h (90%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/privilege_tensor_accessor.h (81%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_argument_accessor.h (96%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_impl_function.variant.toml (72%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature_impl.h (71%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/task_signature_impl.struct.toml (86%) rename lib/task-spec/src/{ => 
task-spec}/concrete_arg.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/fwd_bwd_op_task_impl_function.cc (96%) rename lib/{local-execution/src => task-spec/src/task-spec}/generic_task_impl_function.cc (96%) rename lib/{local-execution/src => task-spec/src/task-spec}/init_op_task_impl_function.cc (96%) create mode 100644 lib/task-spec/src/task-spec/itask_argument_accessor.cc rename lib/task-spec/src/{ => task-spec}/op_arg_ref.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_arg_spec.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_signature.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_task_to_task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/op_tensor_spec.cc (100%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/attention.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/batch_matmul.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/batch_norm.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/cast.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/combine.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/concat.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/conv_2d.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/dropout.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/element_binary.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/element_unary.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/flat.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/gather.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/input.cc (76%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/layer_norm.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/linear.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/noop.cc (95%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/pool_2d.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reduce.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reduction.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/repartition.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/replicate.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reshape.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/reverse.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/softmax.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/split.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/topk.cc (99%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/transpose.cc (98%) rename lib/{local-execution/src/local-execution => task-spec/src/task-spec}/ops/weight.cc (76%) rename 
lib/task-spec/src/{ => task-spec}/per_device_op_state.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/permissions.cc (97%) create mode 100644 lib/task-spec/src/task-spec/privilege_tensor_accessor.cc rename lib/task-spec/src/{ => task-spec}/runtime_arg_ref.cc (100%) rename lib/task-spec/src/{ => task-spec}/task_arg_spec.cc (100%) create mode 100644 lib/task-spec/src/task-spec/task_argument_accessor.cc rename lib/task-spec/src/{ => task-spec}/task_invocation.cc (100%) rename lib/task-spec/src/{ => task-spec}/task_signature.cc (100%) rename lib/{local-execution/src => task-spec/src/task-spec}/task_signature_impl.cc (93%) rename lib/task-spec/src/{ => task-spec}/variadic_tensor_ref.cc (100%) create mode 100644 lib/task-spec/test/CMakeLists.txt create mode 100644 lib/task-spec/test/src/task-spec/arg_ref.cc diff --git a/.proj.toml b/.proj.toml index 8eed6166cd..3a120ca553 100644 --- a/.proj.toml +++ b/.proj.toml @@ -56,13 +56,20 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -[targets.local-execution] +[targets.task_spec] type = "lib" has-cpu-only-tests = true has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false +[targets.local-execution] +type = "lib" +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false + [targets.models] type = "lib" has-cpu-only-tests = true diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index e5f8d243a1..39284b4a6f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -36,8 +36,6 @@ void adam_ps_update_task_gpu(ffStream_t, float beta2, float weight_decay, float epsilon, - size_t size, - int num_replicas, float const *weight_grad_ptr, size_t size, int num_replicas, @@ -54,7 +52,6 @@ void adam_nccl_update_task_gpu(ffStream_t, size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, - size_t size, float *weight_ptr, float *adam_v_ptr, float *adam_m_ptr); diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc deleted file mode 100644 index 7f4f61c271..0000000000 --- a/lib/kernels/src/accessor.cc +++ /dev/null @@ -1,202 +0,0 @@ -#include "kernels/accessor.h" -#include "device.h" - -namespace FlexFlow { - -int32_t *GenericTensorAccessorW::get_int32_ptr() const { - return this->get(); -} - -int64_t *GenericTensorAccessorW::get_int64_ptr() const { - return this->get(); -} - -float *GenericTensorAccessorW::get_float_ptr() const { - return this->get(); -} - -double *GenericTensorAccessorW::get_double_ptr() const { - return this->get(); -} - -half *GenericTensorAccessorW::get_half_ptr() const { - return this->get(); -} - -std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("", - a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { - return (s << fmt::to_string(a)); -} - -int32_t const *GenericTensorAccessorR::get_int32_ptr() const { - return this->get(); -} - -int64_t const *GenericTensorAccessorR::get_int64_ptr() const { - return this->get(); -} - -float const *GenericTensorAccessorR::get_float_ptr() const { - return this->get(); -} - -double const *GenericTensorAccessorR::get_double_ptr() const { - return this->get(); -} - -half const *GenericTensorAccessorR::get_half_ptr() const { - return get(); -} - -std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("", - 
a.data_type, - a.shape, - a.ptr); -} - -std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { - return (s << fmt::to_string(a)); -} - -int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -float *get_float_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { - float *device_ptr = get(a); - int total_elements = get_volume(a.shape).unwrap_nonnegative(); - checkCUDA(cudaMemcpy(host_ptr, - device_ptr, - total_elements * sizeof(float), - cudaMemcpyDeviceToHost)); -} - -double *get_double_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -half *get_half_ptr(GenericTensorAccessorW const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -int32_t const *get_int32_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -int64_t const *get_int64_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -float const *get_float_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -double const *get_double_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -half const *get_half_ptr(GenericTensorAccessorR const &a) { - return get(a); -} - -std::vector - get_int32_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_int64_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_float_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_double_ptrs(std::vector const &a) { - return get(a); -} - -std::vector - get_half_ptrs(std::vector const &a) { - return get(a); -} - -GenericTensorAccessorR read_only_accessor_from_write_accessor( - GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{ - writable.data_type, writable.shape, req(writable.ptr)}; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc deleted file mode 100644 index 114f817215..0000000000 --- a/lib/kernels/src/allocation.cc +++ /dev/null @@ -1,21 +0,0 @@ -#include "kernels/allocation.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -void *Allocator::allocate(size_t mem_size) { - return 
this->i_allocator->allocate(mem_size); -} - -void Allocator::deallocate(void *ptr) { - this->i_allocator->deallocate(ptr); -} - -GenericTensorAccessorW - Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); - return {tensor_shape.data_type, ArrayShape{tensor_shape}, ptr}; -} - -} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index b5042f77a0..409b7533f9 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -234,6 +234,11 @@ bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; } +bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, + GenericTensorAccessorW const &acc2) { + return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; +} + bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype) { @@ -241,9 +246,21 @@ bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, accessor.data_type == expected_dtype; } +bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, + ArrayShape const &expected_shape, + DataType const &expected_dtype) { + return accessor.shape == expected_shape && + accessor.data_type == expected_dtype; +} + std::pair<ArrayShape, DataType> get_shape_and_datatype(GenericTensorAccessorR const &accessor) { return std::make_pair(accessor.shape, accessor.data_type); } +std::pair<ArrayShape, DataType> + get_shape_and_datatype(GenericTensorAccessorW const &accessor) { + return std::make_pair(accessor.shape, accessor.data_type); +} + } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index dc1303b8e0..7c619bb557 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -48,13 +48,25 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } -ManagedPerDeviceFFHandle initialize_single_gpu_handle() { - return ManagedPerDeviceFFHandle(1, 0); +ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { + return ManagedPerDeviceFFHandle{ + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + }; } ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank) { - return ManagedPerDeviceFFHandle(num_ranks, my_rank); + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { + return ManagedPerDeviceFFHandle{ + /*num_ranks=*/num_ranks, + /*my_rank=*/my_rank, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + }; } } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index 09245097b4..33985b0d74 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -3,7 +3,6 @@ name = "AllocatedTensors" features = [ "eq", "fmt", - "hash", ] includes = [
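With the signature change above, callers of initialize_single_gpu_handle now choose the workspace size and tensor-op math policy explicitly instead of relying on the old zero-argument form. A usage sketch, assuming no defaulted arguments are added; the 1 MiB workspace value is illustrative only.

    // Sketch: single-GPU handle with an explicit workspace configuration.
    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
        /*workSpaceSize=*/1024 * 1024,
        /*allowTensorOpMathConversion=*/true);

diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h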
index c46534330b..d95545d1cc 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index c34063af5d..bd59ec325d 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "LocalTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index c06908503a..d625088be4 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,7 +16,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index d9858cde40..b794207c7f 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #include "task-spec/loss_tensor_t.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -12,7 +13,7 @@ struct LossTensorSource { loss_tensor_t new_loss_tensor(); private: - static size_t next_available_loss_tensor_id; + static nonnegative_int next_available_loss_tensor_id; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f6bd5a3ee9..7b08036059 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index c3784b617f..f5daa62090 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ 
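The loss_tensor_source.h hunk above retypes the static id counter from size_t to nonnegative_int (initialized to 0_n in the loss_tensor_source.cc hunk below). The body of new_loss_tensor is not shown in this patch; a plausible sketch of the allocation pattern, offered purely as an assumption:

    // Sketch: hand out monotonically increasing loss tensor ids.
    loss_tensor_t LossTensorSource::new_loss_tensor() {
      loss_tensor_t result =
          loss_tensor_t{LossTensorSource::next_available_loss_tensor_id};
      LossTensorSource::next_available_loss_tensor_id =
          LossTensorSource::next_available_loss_tensor_id + 1_n;
      return result;
    }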
b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_signature_impl.dtg.h", + "task-spec/task_signature_impl.dtg.h", "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 196da16ace..d400b4f815 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,7 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + array_shape_from_tensor_shape(tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -59,7 +59,7 @@ bool are_allocated_gradient_tensors_valid( } ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -101,7 +101,7 @@ bool are_allocated_optimizer_tensors_valid( } ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index d508c34210..4b5ee0b782 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,7 +1,7 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc index da1efa6b85..f5ce639087 100644 --- a/lib/local-execution/src/loss_tensor_source.cc +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -size_t LossTensorSource::next_available_loss_tensor_id = 0; +nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; LossTensorSource::LossTensorSource() {} diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 2787342a5f..0acc3d865d 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,5 +1,5 @@ #include "local-execution/task_registry.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph.h" namespace FlexFlow { diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index a973c6967b..0e79376575 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -11,11 +11,6 @@ ff_add_test_executable( local-execution kernels op-attrs + task-spec ) -set(FF_TEST_EXEC_NAME "local-execution-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} 
-DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake deleted file mode 100644 index 6494ae2d78..0000000000 --- a/lib/local-execution/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 5791a94cbb..b527430d67 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -23,8 +23,8 @@ bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { return true; } -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("E2ETest") { +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalBackend e2e Training") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); @@ -47,7 +47,12 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping=*/{}, + }; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 9966ca5c10..29b3b432cd 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" using namespace ::FlexFlow; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index 16877b0e09..c87fd3a899 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph_builder.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h"
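The test_e2e.cc hunk above shows did_loss_decrease only in outline, since the diff context elides its loop body. A sketch of the per-sample check implied by its signature and by the "lower loss in the last epoch" assertion earlier; the strict inequality is an assumption.

    // Sketch: every sample must improve between the first and last epoch.
    bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) {
      for (int i = 0; i < batch_size; i++) {
        if (last_epoch[i] >= first_epoch[i]) {
          return false;
        }
      }
      return true;
    }

diff --git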
a/lib/pcg/include/pcg/metric_attrs.h b/lib/pcg/include/pcg/metric_attrs.h index 343c2154dd..21f9115a67 100644 --- a/lib/pcg/include/pcg/metric_attrs.h +++ b/lib/pcg/include/pcg/metric_attrs.h @@ -1,7 +1,7 @@ #ifndef _FF_METRICS_H_ #define _FF_METRICS_H_ -#include "op-attrs/ops/loss_functions/loss_functions.h" +#include "op-attrs/ops/loss_functions/loss_function.dtg.h" #include "pcg/metric.dtg.h" #include "utils/fmt.h" #include diff --git a/lib/pcg/src/pcg/metric_attrs.cc b/lib/pcg/src/pcg/metric_attrs.cc index 9a93e75350..5357775149 100644 --- a/lib/pcg/src/pcg/metric_attrs.cc +++ b/lib/pcg/src/pcg/metric_attrs.cc @@ -1,4 +1,5 @@ #include "pcg/metric_attrs.h" +#include namespace FlexFlow { MetricsAttrs::MetricsAttrs(LossFunction _loss_type, @@ -29,8 +30,7 @@ MetricsAttrs::MetricsAttrs(LossFunction _loss_type, measure_mean_absolute_error = true; continue; default: - throw mk_runtime_error(fmt::format( - "Initializing MetricsAttrs with unrecogonized metrics type {}", m)); + PANIC("Initializing MetricsAttrs with unrecognized metrics type {}", m); } } } diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h similarity index 79% rename from lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h rename to lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h index cc82291f6a..3620ff87cb 100644 --- a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/fwd_bwd_op_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FWD_BWD_OP_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FWD_BWD_OP_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h similarity index 80% rename from lib/local-execution/include/local-execution/generic_task_impl_function.h rename to lib/task-spec/include/task-spec/generic_task_impl_function.h index 9ce22ecf54..b02f4d6beb 100644 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h similarity index 81% rename from lib/local-execution/include/local-execution/init_op_task_impl_function.h rename to lib/task-spec/include/task-spec/init_op_task_impl_function.h index 0481e31a5f..f98e972df8 100644 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -1,7 +1,7 @@ -#ifndef 
_FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h similarity index 82% rename from lib/local-execution/include/local-execution/itask_argument_accessor.h rename to lib/task-spec/include/task-spec/itask_argument_accessor.h index 24b3b3a37f..1424b09b84 100644 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "local-execution/privilege_tensor_accessor.h" +#include "task-spec/privilege_tensor_accessor.h" #include "task-spec/concrete_arg.h" #include "task-spec/op_task_signature.h" #include "task-spec/tensor_type.dtg.h" diff --git a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml index 0d0d428a1b..405385069f 100644 --- a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml @@ -7,7 +7,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h" +] [[fields]] name = "raw_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/task-spec/include/task-spec/ops/attention.h similarity index 79% rename from lib/local-execution/include/local-execution/ops/attention.h rename to lib/task-spec/include/task-spec/ops/attention.h index bf5385f609..9b0179eeac 100644 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ b/lib/task-spec/include/task-spec/ops/attention.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_ATTENTION_H -#define _FLEXFLOW_ATTENTION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/task-spec/include/task-spec/ops/batch_matmul.h similarity index 75% rename from lib/local-execution/include/local-execution/ops/batch_matmul.h rename to lib/task-spec/include/task-spec/ops/batch_matmul.h index 64d220ab66..e0dc01d3f1 100644 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ b/lib/task-spec/include/task-spec/ops/batch_matmul.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_BATCH_MATMUL_H -#define _FLEXFLOW_BATCH_MATMUL_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H -#include "local-execution/task_impl_function.dtg.h" +#include 
"task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/task-spec/include/task-spec/ops/batch_norm.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/batch_norm.h rename to lib/task-spec/include/task-spec/ops/batch_norm.h index 85a7190ce1..081b60318f 100644 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ b/lib/task-spec/include/task-spec/ops/batch_norm.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/task-spec/include/task-spec/ops/cast.h similarity index 95% rename from lib/local-execution/include/local-execution/ops/cast.h rename to lib/task-spec/include/task-spec/ops/cast.h index 6a27ad267a..990624b0e3 100644 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ b/lib/task-spec/include/task-spec/ops/cast.h @@ -15,7 +15,7 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/combine.h rename to lib/task-spec/include/task-spec/ops/combine.h index 00e9cbed2c..be16379f36 100644 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ b/lib/task-spec/include/task-spec/ops/combine.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/task-spec/include/task-spec/ops/concat.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/concat.h rename to lib/task-spec/include/task-spec/ops/concat.h index c46164e417..6c7adf76ea 100644 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ b/lib/task-spec/include/task-spec/ops/concat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/task-spec/include/task-spec/ops/conv_2d.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/conv_2d.h rename to lib/task-spec/include/task-spec/ops/conv_2d.h index f3bb34ffeb..b7fda64961 100644 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ b/lib/task-spec/include/task-spec/ops/conv_2d.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git 
a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/task-spec/include/task-spec/ops/dropout.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/dropout.h rename to lib/task-spec/include/task-spec/ops/dropout.h index bd7b426c6b..1801b63123 100644 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ b/lib/task-spec/include/task-spec/ops/dropout.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/dropout_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/task-spec/include/task-spec/ops/element_binary.h similarity index 73% rename from lib/local-execution/include/local-execution/ops/element_binary.h rename to lib/task-spec/include/task-spec/ops/element_binary.h index 4e0bb46e47..57af54522d 100644 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ b/lib/task-spec/include/task-spec/ops/element_binary.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_ELEMENT_BINARY_H -#define _FLEXFLOW_ELEMENT_BINARY_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H -#include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_impl_function.dtg.h" +#include "task-spec/task_signature_impl.h" #include "op-attrs/ops/element_binary_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/task-spec/include/task-spec/ops/element_unary.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/element_unary.h rename to lib/task-spec/include/task-spec/ops/element_unary.h index 9900668d6c..f6dcd41455 100644 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ b/lib/task-spec/include/task-spec/ops/element_unary.h @@ -1,7 +1,7 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/task-spec/include/task-spec/ops/embedding.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/embedding.h rename to lib/task-spec/include/task-spec/ops/embedding.h index b998aef53e..3a80d38398 100644 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ b/lib/task-spec/include/task-spec/ops/embedding.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/task-spec/include/task-spec/ops/flat.h similarity index 90% rename from lib/local-execution/include/local-execution/ops/flat.h rename to lib/task-spec/include/task-spec/ops/flat.h index 95afb98340..6ac72ccd6b 100644 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ b/lib/task-spec/include/task-spec/ops/flat.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include 
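Alongside the file moves, the header hunks above also normalize include guards from short ad-hoc names (e.g. _FLEXFLOW_BATCH_MATMUL_H) to a full-path convention. For a hypothetical new header under lib/task-spec, the pattern would be:

    // lib/task-spec/include/task-spec/ops/my_new_op.h (illustrative path)
    #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_MY_NEW_OP_H
    #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_MY_NEW_OP_H
    // ... declarations ...
    #endif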
"local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/task-spec/include/task-spec/ops/gather.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/gather.h rename to lib/task-spec/include/task-spec/ops/gather.h index 5569a94728..c5ccc4ccdb 100644 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ b/lib/task-spec/include/task-spec/ops/gather.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/task-spec/include/task-spec/ops/input.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/input.h rename to lib/task-spec/include/task-spec/ops/input.h diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/task-spec/include/task-spec/ops/layer_norm.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/layer_norm.h rename to lib/task-spec/include/task-spec/ops/layer_norm.h index e4a15caac2..81af0c360f 100644 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ b/lib/task-spec/include/task-spec/ops/layer_norm.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/task-spec/include/task-spec/ops/linear.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/linear.h rename to lib/task-spec/include/task-spec/ops/linear.h index d58d876865..69197fd627 100644 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ b/lib/task-spec/include/task-spec/ops/linear.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/task-spec/include/task-spec/ops/noop.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/noop.h rename to lib/task-spec/include/task-spec/ops/noop.h diff --git a/lib/local-execution/include/local-execution/ops/parallel_op.h b/lib/task-spec/include/task-spec/ops/parallel_op.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/parallel_op.h rename to lib/task-spec/include/task-spec/ops/parallel_op.h diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/task-spec/include/task-spec/ops/pool_2d.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/pool_2d.h rename to lib/task-spec/include/task-spec/ops/pool_2d.h index 7d0ec44bd7..a3601e8800 100644 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ b/lib/task-spec/include/task-spec/ops/pool_2d.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include 
"local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/task-spec/include/task-spec/ops/reduce.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/reduce.h rename to lib/task-spec/include/task-spec/ops/reduce.h index 5c6d4be338..e44c0f283f 100644 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ b/lib/task-spec/include/task-spec/ops/reduce.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/reduction.h rename to lib/task-spec/include/task-spec/ops/reduction.h index 7475d3aeb4..cba90c37bb 100644 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ b/lib/task-spec/include/task-spec/ops/reduction.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h similarity index 93% rename from lib/local-execution/include/local-execution/ops/repartition.h rename to lib/task-spec/include/task-spec/ops/repartition.h index 08ecdafcf2..f43cf13179 100644 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ b/lib/task-spec/include/task-spec/ops/repartition.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/task-spec/include/task-spec/ops/replicate.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/replicate.h rename to lib/task-spec/include/task-spec/ops/replicate.h index b827b9c272..0086dad741 100644 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ b/lib/task-spec/include/task-spec/ops/replicate.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/reshape.h rename to lib/task-spec/include/task-spec/ops/reshape.h index ed7e6e9e31..f192d83b9a 100644 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "local-execution/task_impl_function.dtg.h" +#include 
"task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/task-spec/include/task-spec/ops/reverse.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/reverse.h rename to lib/task-spec/include/task-spec/ops/reverse.h index dd0e89ecad..bb123b63f5 100644 --- a/lib/local-execution/include/local-execution/ops/reverse.h +++ b/lib/task-spec/include/task-spec/ops/reverse.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/task-spec/include/task-spec/ops/softmax.h similarity index 78% rename from lib/local-execution/include/local-execution/ops/softmax.h rename to lib/task-spec/include/task-spec/ops/softmax.h index 294d948b42..528dd5da0b 100644 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ b/lib/task-spec/include/task-spec/ops/softmax.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_SOFTMAX_H -#define _FLEXFLOW_SOFTMAX_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/split.h b/lib/task-spec/include/task-spec/ops/split.h similarity index 90% rename from lib/local-execution/include/local-execution/ops/split.h rename to lib/task-spec/include/task-spec/ops/split.h index 49cd7cfc7b..ed92f2925e 100644 --- a/lib/local-execution/include/local-execution/ops/split.h +++ b/lib/task-spec/include/task-spec/ops/split.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h similarity index 92% rename from lib/local-execution/include/local-execution/ops/topk.h rename to lib/task-spec/include/task-spec/ops/topk.h index aeded512cd..8afe98d568 100644 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/task-spec/include/task-spec/ops/transpose.h similarity index 91% rename from lib/local-execution/include/local-execution/ops/transpose.h rename to lib/task-spec/include/task-spec/ops/transpose.h index 2c7b5fb3bc..dec29f4b36 100644 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ b/lib/task-spec/include/task-spec/ops/transpose.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "local-execution/task_impl_function.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include 
"op-attrs/ops/transpose_attrs.dtg.h" #include "task-spec/op_task_invocation.h" diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/task-spec/include/task-spec/ops/weight.h similarity index 100% rename from lib/local-execution/include/local-execution/ops/weight.h rename to lib/task-spec/include/task-spec/ops/weight.h diff --git a/lib/local-execution/include/local-execution/permissions.h b/lib/task-spec/include/task-spec/permissions.h similarity index 90% rename from lib/local-execution/include/local-execution/permissions.h rename to lib/task-spec/include/task-spec/permissions.h index f34969f233..d1ae5fc349 100644 --- a/lib/local-execution/include/local-execution/permissions.h +++ b/lib/task-spec/include/task-spec/permissions.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PERMISSIONS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PERMISSIONS_H #include "utils/exception.h" #include "utils/fmt.h" diff --git a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h b/lib/task-spec/include/task-spec/privilege_tensor_accessor.h similarity index 81% rename from lib/local-execution/include/local-execution/privilege_tensor_accessor.h rename to lib/task-spec/include/task-spec/privilege_tensor_accessor.h index aeae3c2e41..171b0fcd39 100644 --- a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h +++ b/lib/task-spec/include/task-spec/privilege_tensor_accessor.h @@ -1,8 +1,8 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PRIVILEGE_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PRIVILEGE_TENSOR_ACCESSOR_H #include "kernels/accessor.h" -#include "local-execution/permissions.h" +#include "task-spec/permissions.h" namespace FlexFlow { diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h similarity index 96% rename from lib/local-execution/include/local-execution/task_argument_accessor.h rename to lib/task-spec/include/task-spec/task_argument_accessor.h index 499b5ff7d6..2cac3a5dd8 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/itask_argument_accessor.h" +#include "task-spec/itask_argument_accessor.h" #include "task-spec/device_specific.h" #include "task-spec/per_device_op_state.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/task-spec/include/task-spec/task_impl_function.variant.toml similarity index 72% rename from lib/local-execution/include/local-execution/task_impl_function.variant.toml rename to lib/task-spec/include/task-spec/task_impl_function.variant.toml index 48cab9eb01..74347a3290 100644 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ b/lib/task-spec/include/task-spec/task_impl_function.variant.toml @@ -8,9 +8,9 @@ features = [ ] includes = [ - 
"local-execution/init_op_task_impl_function.h", - "local-execution/fwd_bwd_op_task_impl_function.h", - "local-execution/generic_task_impl_function.h", + "task-spec/init_op_task_impl_function.h", + "task-spec/fwd_bwd_op_task_impl_function.h", + "task-spec/generic_task_impl_function.h", ] [[values]] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h similarity index 71% rename from lib/local-execution/include/local-execution/task_signature_impl.h rename to lib/task-spec/include/task-spec/task_signature_impl.h index 613a173f25..ee093c7d23 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H -#include "local-execution/task_signature_impl.dtg.h" +#include "task-spec/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" diff --git a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/task-spec/include/task-spec/task_signature_impl.struct.toml similarity index 86% rename from lib/local-execution/include/local-execution/task_signature_impl.struct.toml rename to lib/task-spec/include/task-spec/task_signature_impl.struct.toml index 78064203ec..574f11a084 100644 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ b/lib/task-spec/include/task-spec/task_signature_impl.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_impl_function.dtg.h", + "task-spec/task_impl_function.dtg.h", "task-spec/op_task_signature.h", ] diff --git a/lib/task-spec/src/concrete_arg.cc b/lib/task-spec/src/task-spec/concrete_arg.cc similarity index 100% rename from lib/task-spec/src/concrete_arg.cc rename to lib/task-spec/src/task-spec/concrete_arg.cc diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc similarity index 96% rename from lib/local-execution/src/fwd_bwd_op_task_impl_function.cc rename to lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc index 308dbfd3ae..3450b5d268 100644 --- a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/fwd_bwd_op_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/fwd_bwd_op_task_impl_function.h" +#include "task-spec/fwd_bwd_op_task_impl_function.h" namespace FlexFlow { diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/task-spec/src/task-spec/generic_task_impl_function.cc similarity index 96% rename from lib/local-execution/src/generic_task_impl_function.cc rename to lib/task-spec/src/task-spec/generic_task_impl_function.cc index 87d4db53e6..4abd1ab644 100644 --- a/lib/local-execution/src/generic_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/generic_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/generic_task_impl_function.h" +#include "task-spec/generic_task_impl_function.h" namespace FlexFlow { diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc similarity index 96% rename from 
lib/local-execution/src/init_op_task_impl_function.cc rename to lib/task-spec/src/task-spec/init_op_task_impl_function.cc index abe84b828e..4cd55fc488 100644 --- a/lib/local-execution/src/init_op_task_impl_function.cc +++ b/lib/task-spec/src/task-spec/init_op_task_impl_function.cc @@ -1,4 +1,4 @@ -#include "local-execution/init_op_task_impl_function.h" +#include "task-spec/init_op_task_impl_function.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/itask_argument_accessor.cc b/lib/task-spec/src/task-spec/itask_argument_accessor.cc new file mode 100644 index 0000000000..c7878b1abc --- /dev/null +++ b/lib/task-spec/src/task-spec/itask_argument_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/itask_argument_accessor.h" diff --git a/lib/task-spec/src/op_arg_ref.cc b/lib/task-spec/src/task-spec/op_arg_ref.cc similarity index 100% rename from lib/task-spec/src/op_arg_ref.cc rename to lib/task-spec/src/task-spec/op_arg_ref.cc diff --git a/lib/task-spec/src/op_arg_spec.cc b/lib/task-spec/src/task-spec/op_arg_spec.cc similarity index 100% rename from lib/task-spec/src/op_arg_spec.cc rename to lib/task-spec/src/task-spec/op_arg_spec.cc diff --git a/lib/task-spec/src/op_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_invocation.cc similarity index 100% rename from lib/task-spec/src/op_task_invocation.cc rename to lib/task-spec/src/task-spec/op_task_invocation.cc diff --git a/lib/task-spec/src/op_task_signature.cc b/lib/task-spec/src/task-spec/op_task_signature.cc similarity index 100% rename from lib/task-spec/src/op_task_signature.cc rename to lib/task-spec/src/task-spec/op_task_signature.cc diff --git a/lib/task-spec/src/op_task_to_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc similarity index 100% rename from lib/task-spec/src/op_task_to_task_invocation.cc rename to lib/task-spec/src/task-spec/op_task_to_task_invocation.cc diff --git a/lib/task-spec/src/op_tensor_spec.cc b/lib/task-spec/src/task-spec/op_tensor_spec.cc similarity index 100% rename from lib/task-spec/src/op_tensor_spec.cc rename to lib/task-spec/src/task-spec/op_tensor_spec.cc diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/attention.cc rename to lib/task-spec/src/task-spec/ops/attention.cc index a9e6a9fa30..01960803ce 100644 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/attention.h" +#include "task-spec/ops/attention.h" #include "kernels/attention_kernels.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/batch_matmul.cc rename to lib/task-spec/src/task-spec/ops/batch_matmul.cc index 2cbf1cf20f..371c80d7e2 100644 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/batch_matmul.h" +#include "task-spec/ops/batch_matmul.h" #include "kernels/batch_matmul_kernels.h" #include "op-attrs/ops/batch_matmul.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/batch_norm.cc rename to lib/task-spec/src/task-spec/ops/batch_norm.cc index 7ba62bcc59..2aa308dada 100644 --- a/lib/local-execution/src/local-execution/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/batch_norm.h" +#include "task-spec/ops/batch_norm.h" #include "kernels/batch_norm_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/task-spec/src/task-spec/ops/cast.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/cast.cc rename to lib/task-spec/src/task-spec/ops/cast.cc index 752317d722..7cf26be95b 100644 --- a/lib/local-execution/src/local-execution/ops/cast.cc +++ b/lib/task-spec/src/task-spec/ops/cast.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/cast.h" +#include "task-spec/ops/cast.h" #include "kernels/cast_kernels.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/task-spec/src/task-spec/ops/combine.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/combine.cc rename to lib/task-spec/src/task-spec/ops/combine.cc index 32fab636d3..41c276facb 100644 --- a/lib/local-execution/src/local-execution/ops/combine.cc +++ b/lib/task-spec/src/task-spec/ops/combine.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/combine.h" +#include "task-spec/ops/combine.h" #include "kernels/combine_kernels.h" #include "task-spec/op_task_invocation.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/task-spec/src/task-spec/ops/concat.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/concat.cc rename to lib/task-spec/src/task-spec/ops/concat.cc index 8531bf77c0..2cb082d1eb 100644 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ b/lib/task-spec/src/task-spec/ops/concat.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/concat.h" +#include "task-spec/ops/concat.h" #include "kernels/concat_kernels.h" #include "task-spec/op_task_signature.h" #include "task-spec/variadic_tensor_ref.h" diff --git a/lib/local-execution/src/local-execution/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/conv_2d.cc rename to lib/task-spec/src/task-spec/ops/conv_2d.cc index cc0febff24..47b889c6ce 100644 --- a/lib/local-execution/src/local-execution/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/conv_2d.h" +#include "task-spec/ops/conv_2d.h" #include "kernels/conv_2d_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/task-spec/src/task-spec/ops/dropout.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/dropout.cc rename to lib/task-spec/src/task-spec/ops/dropout.cc index cc09841190..d19ace886b 100644 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ b/lib/task-spec/src/task-spec/ops/dropout.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/dropout.h" +#include "task-spec/ops/dropout.h" #include "kernels/dropout_kernels.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" diff --git a/lib/local-execution/src/local-execution/ops/element_binary.cc b/lib/task-spec/src/task-spec/ops/element_binary.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/element_binary.cc rename to lib/task-spec/src/task-spec/ops/element_binary.cc index ec8ed298d0..5356901423 100644 --- a/lib/local-execution/src/local-execution/ops/element_binary.cc +++ b/lib/task-spec/src/task-spec/ops/element_binary.cc @@ -1,6 +1,6 @@ -#include "local-execution/ops/element_binary.h" +#include "task-spec/ops/element_binary.h" #include "kernels/element_binary_kernels.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/hash-utils.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/element_unary.cc b/lib/task-spec/src/task-spec/ops/element_unary.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/element_unary.cc rename to lib/task-spec/src/task-spec/ops/element_unary.cc index 4cf54e5b38..1f4e651251 100644 --- a/lib/local-execution/src/local-execution/ops/element_unary.cc +++ b/lib/task-spec/src/task-spec/ops/element_unary.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/element_unary.h" +#include "task-spec/ops/element_unary.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/flat.cc b/lib/task-spec/src/task-spec/ops/flat.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/flat.cc rename to lib/task-spec/src/task-spec/ops/flat.cc index 414a56769d..1bc0999e1a 100644 --- a/lib/local-execution/src/local-execution/ops/flat.cc +++ b/lib/task-spec/src/task-spec/ops/flat.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/flat.h" +#include "task-spec/ops/flat.h" #include "kernels/flat_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/gather.cc rename to lib/task-spec/src/task-spec/ops/gather.cc index 
7e4b99a557..a0bfaddc0f 100644 --- a/lib/local-execution/src/local-execution/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/gather.h" +#include "task-spec/ops/gather.h" #include "kernels/gather_kernels.h" #include "utils/nonnegative_int/nonnegative_range.h" #include diff --git a/lib/local-execution/src/local-execution/ops/input.cc b/lib/task-spec/src/task-spec/ops/input.cc similarity index 76% rename from lib/local-execution/src/local-execution/ops/input.cc rename to lib/task-spec/src/task-spec/ops/input.cc index d7a3888220..53caadfe68 100644 --- a/lib/local-execution/src/local-execution/ops/input.cc +++ b/lib/task-spec/src/task-spec/ops/input.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/input.h" +#include "task-spec/ops/input.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/layer_norm.cc rename to lib/task-spec/src/task-spec/ops/layer_norm.cc index d2fc930375..c2f16d7eda 100644 --- a/lib/local-execution/src/local-execution/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/layer_norm.h" +#include "task-spec/ops/layer_norm.h" #include "kernels/layer_norm_kernels.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/linear.cc rename to lib/task-spec/src/task-spec/ops/linear.cc index 96fcc85ca1..8d4a81c5c4 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,6 +1,6 @@ -#include "local-execution/ops/linear.h" +#include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" -#include "local-execution/task_argument_accessor.h" +#include "task-spec/task_argument_accessor.h" #include "op-attrs/ff_dim_t.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/noop.cc b/lib/task-spec/src/task-spec/ops/noop.cc similarity index 95% rename from lib/local-execution/src/local-execution/ops/noop.cc rename to lib/task-spec/src/task-spec/ops/noop.cc index 7357806880..4d69b8fd5f 100644 --- a/lib/local-execution/src/local-execution/ops/noop.cc +++ b/lib/task-spec/src/task-spec/ops/noop.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/noop.h" +#include "task-spec/ops/noop.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/pool_2d.cc rename to lib/task-spec/src/task-spec/ops/pool_2d.cc index 6db1cf9dc3..d7064ca04d 100644 --- a/lib/local-execution/src/local-execution/ops/pool_2d.cc +++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc @@ -1,6 +1,5 @@ -#include "local-execution/ops/pool_2d.h" +#include "task-spec/ops/pool_2d.h" #include "kernels/pool_2d_kernels.h" - #include "op-attrs/ops/pool_2d.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/reduce.cc rename to lib/task-spec/src/task-spec/ops/reduce.cc index bc4b5343c2..ccc1285aaa 100644 --- a/lib/local-execution/src/local-execution/ops/reduce.cc +++ b/lib/task-spec/src/task-spec/ops/reduce.cc @@ -1,6 +1,5 @@ -#include "local-execution/ops/reduce.h" +#include "task-spec/ops/reduce.h" #include "kernels/reduce_kernels.h" - #include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/type_traits_core.h" diff --git a/lib/local-execution/src/local-execution/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/reduction.cc rename to lib/task-spec/src/task-spec/ops/reduction.cc index 340f695ffb..96e2c6c506 100644 --- a/lib/local-execution/src/local-execution/ops/reduction.cc +++ b/lib/task-spec/src/task-spec/ops/reduction.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reduction.h" +#include "task-spec/ops/reduction.h" #include "kernels/reduction_kernels.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/repartition.cc b/lib/task-spec/src/task-spec/ops/repartition.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/repartition.cc rename to lib/task-spec/src/task-spec/ops/repartition.cc index 942f2d8fee..cfc45dede7 100644 --- a/lib/local-execution/src/local-execution/ops/repartition.cc +++ b/lib/task-spec/src/task-spec/ops/repartition.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/repartition.h" +#include "task-spec/ops/repartition.h" #include "kernels/partition_kernels.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/local-execution/src/local-execution/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/replicate.cc rename to lib/task-spec/src/task-spec/ops/replicate.cc index 13a4fd1635..0ed5d98708 100644 --- a/lib/local-execution/src/local-execution/ops/replicate.cc +++ b/lib/task-spec/src/task-spec/ops/replicate.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/replicate.h" +#include "task-spec/ops/replicate.h" #include "kernels/replicate_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/reshape.cc b/lib/task-spec/src/task-spec/ops/reshape.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/reshape.cc rename to lib/task-spec/src/task-spec/ops/reshape.cc index 294e207f00..0b43f3e31f 100644 --- a/lib/local-execution/src/local-execution/ops/reshape.cc +++ b/lib/task-spec/src/task-spec/ops/reshape.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reshape.h" +#include "task-spec/ops/reshape.h" #include "kernels/reshape_kernels.h" namespace FlexFlow { diff --git a/lib/local-execution/src/local-execution/ops/reverse.cc b/lib/task-spec/src/task-spec/ops/reverse.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/reverse.cc rename to lib/task-spec/src/task-spec/ops/reverse.cc index f3178e86ba..41739d086e 100644 --- a/lib/local-execution/src/local-execution/ops/reverse.cc +++ b/lib/task-spec/src/task-spec/ops/reverse.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/reverse.h" +#include "task-spec/ops/reverse.h" #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "utils/nonnegative_int/nonnegative_range.h" diff --git a/lib/local-execution/src/local-execution/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/softmax.cc rename to lib/task-spec/src/task-spec/ops/softmax.cc index 4dedff6e18..d7b27fd884 100644 --- a/lib/local-execution/src/local-execution/ops/softmax.cc +++ b/lib/task-spec/src/task-spec/ops/softmax.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/softmax.h" +#include "task-spec/ops/softmax.h" #include "kernels/softmax_kernels.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/split.cc rename to lib/task-spec/src/task-spec/ops/split.cc index 5661fa7381..a14f6a587d 100644 --- a/lib/local-execution/src/local-execution/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/split.h" +#include "task-spec/ops/split.h" #include "kernels/array_shape.h" #include "kernels/split_kernels.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc similarity index 99% rename from lib/local-execution/src/local-execution/ops/topk.cc rename to lib/task-spec/src/task-spec/ops/topk.cc index fd895605a1..11f1fffa41 100644 --- a/lib/local-execution/src/local-execution/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -13,7 +13,7 @@ * limitations under the License. 
*/ -#include "local-execution/ops/topk.h" +#include "task-spec/ops/topk.h" #include "kernels/topk_kernels.h" #include "utils/exception.h" diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/task-spec/src/task-spec/ops/transpose.cc similarity index 98% rename from lib/local-execution/src/local-execution/ops/transpose.cc rename to lib/task-spec/src/task-spec/ops/transpose.cc index c3de935d7c..b6a69b0ed7 100644 --- a/lib/local-execution/src/local-execution/ops/transpose.cc +++ b/lib/task-spec/src/task-spec/ops/transpose.cc @@ -13,7 +13,7 @@ * limitations under the License. */ -#include "local-execution/ops/transpose.h" +#include "task-spec/ops/transpose.h" #include "kernels/transpose_kernels.h" #include "op-attrs/ops/transpose.h" #include "utils/integer_conversions.h" diff --git a/lib/local-execution/src/local-execution/ops/weight.cc b/lib/task-spec/src/task-spec/ops/weight.cc similarity index 76% rename from lib/local-execution/src/local-execution/ops/weight.cc rename to lib/task-spec/src/task-spec/ops/weight.cc index f96c104f33..08c9be26e9 100644 --- a/lib/local-execution/src/local-execution/ops/weight.cc +++ b/lib/task-spec/src/task-spec/ops/weight.cc @@ -1,4 +1,4 @@ -#include "local-execution/ops/weight.h" +#include "task-spec/ops/weight.h" namespace FlexFlow { diff --git a/lib/task-spec/src/per_device_op_state.cc b/lib/task-spec/src/task-spec/per_device_op_state.cc similarity index 100% rename from lib/task-spec/src/per_device_op_state.cc rename to lib/task-spec/src/task-spec/per_device_op_state.cc diff --git a/lib/local-execution/src/permissions.cc b/lib/task-spec/src/task-spec/permissions.cc similarity index 97% rename from lib/local-execution/src/permissions.cc rename to lib/task-spec/src/task-spec/permissions.cc index 2286215987..8b5edb4df1 100644 --- a/lib/local-execution/src/permissions.cc +++ b/lib/task-spec/src/task-spec/permissions.cc @@ -1,4 +1,4 @@ -#include "local-execution/permissions.h" +#include "task-spec/permissions.h" #include "utils/exception.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc b/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc new file mode 100644 index 0000000000..a0c55b4dad --- /dev/null +++ b/lib/task-spec/src/task-spec/privilege_tensor_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/privilege_tensor_accessor.h" diff --git a/lib/task-spec/src/runtime_arg_ref.cc b/lib/task-spec/src/task-spec/runtime_arg_ref.cc similarity index 100% rename from lib/task-spec/src/runtime_arg_ref.cc rename to lib/task-spec/src/task-spec/runtime_arg_ref.cc diff --git a/lib/task-spec/src/task_arg_spec.cc b/lib/task-spec/src/task-spec/task_arg_spec.cc similarity index 100% rename from lib/task-spec/src/task_arg_spec.cc rename to lib/task-spec/src/task-spec/task_arg_spec.cc diff --git a/lib/task-spec/src/task-spec/task_argument_accessor.cc b/lib/task-spec/src/task-spec/task_argument_accessor.cc new file mode 100644 index 0000000000..cee9fc0708 --- /dev/null +++ b/lib/task-spec/src/task-spec/task_argument_accessor.cc @@ -0,0 +1 @@ +#include "task-spec/task_argument_accessor.h" diff --git a/lib/task-spec/src/task_invocation.cc b/lib/task-spec/src/task-spec/task_invocation.cc similarity index 100% rename from lib/task-spec/src/task_invocation.cc rename to lib/task-spec/src/task-spec/task_invocation.cc diff --git a/lib/task-spec/src/task_signature.cc b/lib/task-spec/src/task-spec/task_signature.cc similarity index 100% rename from lib/task-spec/src/task_signature.cc rename to 
lib/task-spec/src/task-spec/task_signature.cc diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/task-spec/src/task-spec/task_signature_impl.cc similarity index 93% rename from lib/local-execution/src/task_signature_impl.cc rename to lib/task-spec/src/task-spec/task_signature_impl.cc index 9031d2a015..7995c0af0b 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/task-spec/src/task-spec/task_signature_impl.cc @@ -1,33 +1,33 @@ -#include "local-execution/task_signature_impl.h" -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include "local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" -#include "local-execution/ops/weight.h" +#include "task-spec/task_signature_impl.h" +#include "task-spec/ops/attention.h" +#include "task-spec/ops/batch_matmul.h" +#include "task-spec/ops/batch_norm.h" +#include "task-spec/ops/cast.h" +#include "task-spec/ops/combine.h" +#include "task-spec/ops/concat.h" +#include "task-spec/ops/conv_2d.h" +#include "task-spec/ops/dropout.h" +#include "task-spec/ops/element_binary.h" +#include "task-spec/ops/element_unary.h" +#include "task-spec/ops/embedding.h" +#include "task-spec/ops/flat.h" +#include "task-spec/ops/gather.h" +#include "task-spec/ops/input.h" +#include "task-spec/ops/layer_norm.h" +#include "task-spec/ops/linear.h" +#include "task-spec/ops/noop.h" +#include "task-spec/ops/pool_2d.h" +#include "task-spec/ops/reduce.h" +#include "task-spec/ops/reduction.h" +#include "task-spec/ops/repartition.h" +#include "task-spec/ops/replicate.h" +#include "task-spec/ops/reshape.h" +#include "task-spec/ops/reverse.h" +#include "task-spec/ops/softmax.h" +#include "task-spec/ops/split.h" +#include "task-spec/ops/topk.h" +#include "task-spec/ops/transpose.h" +#include "task-spec/ops/weight.h" #include "utils/overload.h" namespace FlexFlow { diff --git a/lib/task-spec/src/variadic_tensor_ref.cc b/lib/task-spec/src/task-spec/variadic_tensor_ref.cc similarity index 100% rename from lib/task-spec/src/variadic_tensor_ref.cc rename to lib/task-spec/src/task-spec/variadic_tensor_ref.cc diff --git a/lib/task-spec/test/CMakeLists.txt b/lib/task-spec/test/CMakeLists.txt new file mode 100644 index 0000000000..87abf10401 --- /dev/null +++ b/lib/task-spec/test/CMakeLists.txt @@ -0,0 +1,14 @@ +ff_add_test_executable( + NAME + task-spec-tests + SRC_PATTERNS + src/*.cc + PRIVATE_INCLUDE + src/ + DEPS + doctest + utils-test-common + local-execution + kernels + op-attrs +) diff --git 
a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc new file mode 100644 index 0000000000..e1c5a9bd8d --- /dev/null +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -0,0 +1,33 @@ +#include +#include "task-spec/arg_ref.h" +#include + +using namespace ::FlexFlow; + +enum class ExampleLabelType { + STRING, +}; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ArgRefSpec::holds") { + CHECK_MESSAGE(false, "TODO: ArgRefSpec"); + + ArgRefSpec arg_ref_spec = ArgRefSpec::create( + ArgRef{ExampleLabelType::STRING} + ); + + SUBCASE("returns true if the type matches the ArgRef type") { + bool result = arg_ref_spec.holds(); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("returns false otherwise") { + bool result = arg_ref_spec.holds(); + bool correct = false; + + CHECK(result == correct); + } + } +} From 292c61c754c85d2c310fe06b56b0716e467f1d2a Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 14 May 2025 18:13:26 -0700 Subject: [PATCH 71/91] Sync changes with Reyna --- lib/kernels/include/kernels/array_coord.h | 13 ++ .../kernels/compare_tensor_accessors.h | 35 +++ .../kernels/create_accessor_with_contents.h | 214 ++++++++++++++++++ .../kernels/managed_per_device_ff_handle.h | 7 +- .../include/kernels/map_tensor_accessors.h | 93 ++++++++ .../include/kernels/reduce_tensor_accessors.h | 39 ++++ lib/kernels/src/kernels/array_coord.cc | 20 ++ .../src/kernels/compare_tensor_accessors.cc | 50 ++++ .../kernels/create_accessor_with_contents.cc | 44 ++++ .../src/kernels/map_tensor_accessors.cc | 26 +++ .../test/src/cpu/ops/replicate_kernels.cc | 11 +- .../test/src/cpu/ops/reverse_kernels.cc | 40 ++-- lib/kernels/test/src/internal/test_utils.cc | 192 ---------------- lib/kernels/test/src/internal/test_utils.h | 26 --- lib/kernels/test/src/kernels/array_coord.cc | 44 ++++ .../src/kernels/compare_tensor_accessors.cc | 57 +++++ .../src/kernels/format_accessor_contents.cc | 9 +- lib/kernels/test/src/test_attention_kernel.cc | 5 +- .../test/src/test_batch_matmul_kernel.cc | 5 +- .../test/src/test_batch_norm_kernel.cc | 7 +- lib/kernels/test/src/test_combine_kernel.cc | 7 +- lib/kernels/test/src/test_concat_kernel.cc | 5 +- lib/kernels/test/src/test_dropout.cc | 5 +- lib/kernels/test/src/test_flat_kernel.cc | 5 +- lib/kernels/test/src/test_gather_kernels.cc | 5 +- .../test/src/test_layer_norm_kernels.cc | 5 +- .../test/src/test_managed_ff_stream.cc | 7 +- .../src/test_managed_per_device_ff_handle.cc | 15 +- lib/kernels/test/src/test_partition_kernel.cc | 5 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 5 +- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 19 +- lib/kernels/test/src/test_reshape_kernel.cc | 5 +- lib/kernels/test/src/test_reverse_kernels.cc | 12 +- lib/kernels/test/src/test_softmax_kernel.cc | 5 +- lib/kernels/test/src/test_split_kernel.cc | 5 +- lib/kernels/test/src/test_transpose_kernel.cc | 5 +- .../local-execution/model_training_instance.h | 2 +- .../src/model_training_instance.cc | 4 +- lib/local-execution/test/src/test_e2e.cc | 34 +-- .../test/src/test_local_cost_estimator.cc | 6 +- .../test/src/test_loss_functions.cc | 5 +- lib/local-execution/test/src/test_update.cc | 5 +- lib/op-attrs/include/op-attrs/datatype.h | 60 +++-- 44 files changed, 846 insertions(+), 327 deletions(-) create mode 100644 lib/kernels/include/kernels/array_coord.h create mode 100644 lib/kernels/include/kernels/compare_tensor_accessors.h create mode 100644 
lib/kernels/include/kernels/create_accessor_with_contents.h
 create mode 100644 lib/kernels/include/kernels/map_tensor_accessors.h
 create mode 100644 lib/kernels/include/kernels/reduce_tensor_accessors.h
 create mode 100644 lib/kernels/src/kernels/array_coord.cc
 create mode 100644 lib/kernels/src/kernels/compare_tensor_accessors.cc
 create mode 100644 lib/kernels/src/kernels/create_accessor_with_contents.cc
 create mode 100644 lib/kernels/src/kernels/map_tensor_accessors.cc
 create mode 100644 lib/kernels/test/src/kernels/array_coord.cc
 create mode 100644 lib/kernels/test/src/kernels/compare_tensor_accessors.cc

diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h
new file mode 100644
index 0000000000..f739a3d707
--- /dev/null
+++ b/lib/kernels/include/kernels/array_coord.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
+
+#include "kernels/array_coord.dtg.h"
+
+namespace FlexFlow {
+
+ArrayCoord array_coord_drop_dims(
+    ArrayCoord const &,
+    std::function<bool(ff_dim_t)> const &should_drop_dim);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/compare_tensor_accessors.h b/lib/kernels/include/kernels/compare_tensor_accessors.h
new file mode 100644
index 0000000000..ee438505fb
--- /dev/null
+++ b/lib/kernels/include/kernels/compare_tensor_accessors.h
@@ -0,0 +1,35 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_COMPARE_TENSOR_ACCESSORS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_COMPARE_TENSOR_ACCESSORS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+
+namespace FlexFlow {
+
+GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &allocator);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h
new file mode 100644
index 0000000000..fc07d432b2
--- /dev/null
+++ b/lib/kernels/include/kernels/create_accessor_with_contents.h
@@ -0,0 +1,214 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CREATE_ACCESSOR_WITH_CONTENTS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CREATE_ACCESSOR_WITH_CONTENTS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+#include "kernels/local_cpu_allocator.h"
+#include "utils/containers/require_all_same1.h"
+
+namespace FlexFlow {
+
+template <typename T>
+GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<T> const &contents,
+                                       Allocator &allocator) {
+  nonnegative_int ncols = num_elements(contents);
+  ASSERT(ncols > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{ncols}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int col_idx : nonnegative_range(ncols)) {
+    cpu_accessor.at<type_to_data_type_enum_v<T>>(FFOrdered{col_idx}) =
+        contents.at(col_idx.unwrap_nonnegative());
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_2d_accessor_w_with_contents(
+    std::vector<std::vector<T>> const &contents, Allocator &allocator) {
+  nonnegative_int nrows = num_elements(contents);
+  ASSERT(nrows > 0);
+
+  nonnegative_int ncols = throw_if_unexpected(
+      require_all_same1(transform(contents, [](std::vector<T> const &row) {
+        return num_elements(row);
+      })));
+  ASSERT(ncols > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{nrows, ncols}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int row_idx : nonnegative_range(nrows)) {
+    for (nonnegative_int col_idx : nonnegative_range(ncols)) {
+      cpu_accessor.at<type_to_data_type_enum_v<T>>(FFOrdered{row_idx, col_idx}) =
+          contents.at(row_idx.unwrap_nonnegative())
+              .at(col_idx.unwrap_nonnegative());
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_3d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<T>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<T>> const &m) {
+        return num_elements(m);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(
+      transform(contents, [](std::vector<std::vector<T>> const &m) {
+        return throw_if_unexpected(
+            require_all_same1(transform(m, [](std::vector<T> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        cpu_accessor.at<type_to_data_type_enum_v<T>>(
+            FFOrdered{dim0_idx, dim1_idx, dim2_idx}) =
+            contents.at(dim0_idx.unwrap_nonnegative())
+                .at(dim1_idx.unwrap_nonnegative())
+                .at(dim2_idx.unwrap_nonnegative());
+      }
+    }
+  }
+
+  GenericTensorAccessorW result = allocator.allocate_tensor(shape);
+  copy_accessor_data_to_l_from_r(
+      result, read_only_accessor_from_write_accessor(cpu_accessor));
+
+  return result;
+}
+
+template <typename T>
+GenericTensorAccessorW create_4d_accessor_w_with_contents(
+    std::vector<std::vector<std::vector<std::vector<T>>>> const &contents,
+    Allocator &allocator) {
+  nonnegative_int dim0_size = num_elements(contents);
+  ASSERT(dim0_size > 0);
+
+  nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &t) {
+        return num_elements(t);
+      })));
+  ASSERT(dim1_size > 0);
+
+  nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &m) {
+        return throw_if_unexpected(require_all_same1(
+            transform(m, [](std::vector<std::vector<T>> const &vec) {
+              return num_elements(vec);
+            })));
+      })));
+  ASSERT(dim2_size > 0);
+
+  nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform(
+      contents, [](std::vector<std::vector<std::vector<T>>> const &t) {
+        return throw_if_unexpected(require_all_same1(
+            transform(t, [](std::vector<std::vector<T>> const &mat) {
+              return throw_if_unexpected(require_all_same1(
+                  transform(mat, [](std::vector<T> const &vec) {
+                    return num_elements(vec);
+                  })));
+            })));
+      })));
+  ASSERT(dim3_size > 0);
+
+  TensorShape shape = TensorShape{
+      TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}},
+      type_to_data_type_enum_v<T>,
+  };
+
+  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
+
+  for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) {
+    for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) {
+      for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) {
+        for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) {
+          accessor.at<type_to_data_type_enum_v<T>>(
+              FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) =
+              contents.at(dim0_idx.unwrap_nonnegative())
+                  .at(dim1_idx.unwrap_nonnegative())
+                  .at(dim2_idx.unwrap_nonnegative())
+                  .at(dim3_idx.unwrap_nonnegative());
+        }
+      }
+    }
+  }
+
+  return accessor;
+}
+
+template <typename T>
+GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<T> const &contents,
+                                       Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_1d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_2d_accessor_r_with_contents(
+    std::vector<std::vector<T>> const &contents, Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_2d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_3d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<T>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_3d_accessor_w_with_contents(contents, allocator));
+}
+
+template <typename T>
+GenericTensorAccessorR create_4d_accessor_r_with_contents(
+    std::vector<std::vector<std::vector<std::vector<T>>>> const &contents,
+    Allocator &allocator) {
+  return read_only_accessor_from_write_accessor(
+      create_4d_accessor_w_with_contents(contents, allocator));
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
index 0226b1a76c..d409ec19ad 100644
--- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h
+++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
@@ -33,9 +33,12 @@ struct ManagedPerDeviceFFHandle {
   PerDeviceFFHandle *handle;
 };
 
-ManagedPerDeviceFFHandle initialize_single_gpu_handle();
+ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize,
+                                                      bool allowTensorOpMathConversion);
 
 ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks,
-                                                     int my_rank);
+                                                     int my_rank,
+                                                     size_t workSpaceSize,
+                                                     bool allowTensorOpMathConversion);
 
 } // namespace FlexFlow
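
A minimal usage sketch of the literal-contents constructors just added (editorial, not part of the patch; it assumes the template parameter T as reconstructed above, and the CPU allocator used throughout the kernel tests in this commit):

    // Build a 2x3 int32 accessor from a row-major nested literal.
    // contents[row][col]; every row must have the same length, or the
    // require_all_same1 check in create_2d_accessor_w_with_contents fails.
    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR m = create_2d_accessor_r_with_contents<int32_t>(
        {
            {1, 2, 3},
            {4, 5, 6},
        },
        cpu_allocator);
    // m now has shape 2x3 and data type DataType::INT32; the contents were
    // staged through a CPU allocation and copied into the target allocator.
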
+#include "kernels/allocation.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "utils/containers/require_same.h" +#include "utils/containers/require_all_same1.h" + +namespace FlexFlow { + +template +struct CPUMapTensorAccessor { + template + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + F &&f) { + ArrayShape shape = require_same(input.shape, output.shape); + + ASSERT(input.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(shape)) { + output.at(coord.ff_ordered) + = f(input.at
(coord.ff_ordered)); + } + } +}; + +template > +GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, + Allocator &output_allocator, + F &&f) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(input.shape, type_to_data_type_enum_v)); + + DataTypeDispatch1{}(input.data_type, input_cpu, output_cpu, f); + + return copy_tensor_accessor_w(output_cpu, output_allocator); +} + +template +struct CPUMapTensorAccessors2 { + template > + void operator()(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW &output, + F &&f) { + + ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ + lhs.shape, + rhs.shape, + output.shape, + })); + + ASSERT(lhs.device_type == DeviceType::CPU); + ASSERT(rhs.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(shape)) { + output.at>(coord.ff_ordered) + = f(lhs.at
(coord.ff_ordered), rhs.at
+    }
+  }
+};
+
+template <typename F, typename Out = std::invoke_result_t<F, float, float>>
+GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs,
+                                             GenericTensorAccessorR const &rhs,
+                                             Allocator &output_allocator,
+                                             F &&f) {
+  ArrayShape shape = require_same(lhs.shape, rhs.shape);
+  DataType input_data_type = require_same(lhs.data_type, rhs.data_type);
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+  GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator);
+  GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator);
+  DataType output_data_type = type_to_data_type_enum_v<Out>;
+  GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type));
+
+  DataTypeDispatch1<CPUMapTensorAccessors2>{}(input_data_type, lhs_cpu, rhs_cpu, output_cpu, f);
+
+  return copy_tensor_accessor_w(output_cpu, output_allocator);
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/reduce_tensor_accessors.h b/lib/kernels/include/kernels/reduce_tensor_accessors.h
new file mode 100644
index 0000000000..c80c41778f
--- /dev/null
+++ b/lib/kernels/include/kernels/reduce_tensor_accessors.h
@@ -0,0 +1,39 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H
+
+#include "kernels/accessor.h"
+#include "kernels/allocation.h"
+
+namespace FlexFlow {
+
+template <DataType DT>
+struct CPUReduceTensorAccessorInDims {
+  template <typename F>
+  void operator()(GenericTensorAccessorR const &input,
+                  GenericTensorAccessorW &output,
+                  std::unordered_set<ff_dim_t> const &dims_to_reduce,
+                  F &&f) {
+
+    ASSERT(input.device_type == DeviceType::CPU);
+    ASSERT(output.device_type == DeviceType::CPU);
+
+    for (ArrayCoord const &coord : get_array_coord_set(input.shape)) {
+      ArrayCoord out_coord = array_coord_drop_dims(
+          coord, [&](ff_dim_t dim) { return contains(dims_to_reduce, dim); });
+      output.at<DT>(out_coord.ff_ordered) =
+          f(output.at<DT>(out_coord.ff_ordered), input.at<DT>(coord.ff_ordered));
+    }
+  }
+};
+
+template <typename F>
+GenericTensorAccessorW reduce_tensor_accessor_in_dims(std::unordered_set<ff_dim_t> const &dims,
+                                                      F &&f) {
+  NOT_IMPLEMENTED();
+}
+
+GenericTensorAccessorW reduce_tensor_accessor_all(GenericTensorAccessorR const &input,
+                                                  Allocator &allocator);
+
+} // namespace FlexFlow
+
+#endif
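
Taken together, the header above gives an elementwise map over one or two accessors, with the loop always running on CPU and the data copied to and from the device as needed. A usage sketch (editorial, not part of the patch; the fixed float result type mirrors the F1/F2 instantiation helpers in map_tensor_accessors.cc further down):

    // Double every element of an int32 accessor. The callable's result type
    // (float here, regardless of input element type) determines the output
    // accessor's DataType.
    GenericTensorAccessorR input =
        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
    GenericTensorAccessorW doubled = map_tensor_accessor(
        input, cpu_allocator,
        [](auto const &x) { return 2.0f * static_cast<float>(x); });
    // doubled now holds {2.0f, 6.0f, 4.0f}.
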
diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/kernels/src/kernels/array_coord.cc
new file mode 100644
index 0000000000..60bb19351c
--- /dev/null
+++ b/lib/kernels/src/kernels/array_coord.cc
@@ -0,0 +1,20 @@
+#include "kernels/array_coord.h"
+#include "op-attrs/ff_ordered/ff_ordered_of.h"
+#include "op-attrs/ff_ordered/get_idxs.h"
+#include <vector>
+
+namespace FlexFlow {
+
+ArrayCoord array_coord_drop_dims(
+    ArrayCoord const &coord,
+    std::function<bool(ff_dim_t)> const &should_drop_dim) {
+  std::vector<nonnegative_int> result;
+  for (ff_dim_t idx : get_idxs(coord.ff_ordered)) {
+    if (!should_drop_dim(idx)) {
+      result.push_back(coord.ff_ordered.at(idx));
+    }
+  }
+
+  return ArrayCoord{ff_ordered_of(result)};
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc
new file mode 100644
index 0000000000..4594fed322
--- /dev/null
+++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc
@@ -0,0 +1,50 @@
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/map_tensor_accessors.h"
+
+namespace FlexFlow {
+
+GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l < r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l <= r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l > r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l >= r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l == r; });
+}
+
+GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs,
+                                                   GenericTensorAccessorR const &rhs,
+                                                   Allocator &output_allocator) {
+  return map_tensor_accessors2(lhs, rhs, output_allocator,
+                               [](auto const &l, auto const &r) { return l != r; });
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/create_accessor_with_contents.cc b/lib/kernels/src/kernels/create_accessor_with_contents.cc
new file mode 100644
index 0000000000..f8b85baa4a
--- /dev/null
+++ b/lib/kernels/src/kernels/create_accessor_with_contents.cc
@@ -0,0 +1,44 @@
+#include "kernels/create_accessor_with_contents.h"
+
+namespace FlexFlow {
+
+template
+    GenericTensorAccessorW
+    create_1d_accessor_w_with_contents(std::vector<float> const &,
+                                       Allocator &);
+
+template
+    GenericTensorAccessorW create_2d_accessor_w_with_contents(
+        std::vector<std::vector<float>> const &, Allocator &);
+
+template
+    GenericTensorAccessorW create_3d_accessor_w_with_contents(
+        std::vector<std::vector<std::vector<float>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorW create_4d_accessor_w_with_contents(
+        std::vector<std::vector<std::vector<std::vector<float>>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorR
+    create_1d_accessor_r_with_contents(std::vector<float> const &,
+                                       Allocator &);
+
+template
+    GenericTensorAccessorR create_2d_accessor_r_with_contents(
+        std::vector<std::vector<float>> const &, Allocator &);
+
+template
+    GenericTensorAccessorR create_3d_accessor_r_with_contents(
+        std::vector<std::vector<std::vector<float>>> const &,
+        Allocator &);
+
+template
+    GenericTensorAccessorR create_4d_accessor_r_with_contents(
+        std::vector<std::vector<std::vector<std::vector<float>>>> const &,
+        Allocator &);
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc
new file mode 100644
index 0000000000..619f1cc412
--- /dev/null
+++ b/lib/kernels/src/kernels/map_tensor_accessors.cc
@@ -0,0 +1,26 @@
+#include "kernels/map_tensor_accessors.h"
+
+namespace FlexFlow {
+
+struct F1 {
+  template <typename T>
+  float operator()(T const &t) const { NOT_IMPLEMENTED(); }
+};
+
+template
+GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &,
+                                           Allocator &,
+                                           F1 &&);
+
+struct F2 {
+  template <typename T>
+  float operator()(T const &lhs, T const &rhs) const { NOT_IMPLEMENTED(); }
+};
+
+template
+    GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &,
+                                                 GenericTensorAccessorR const &,
+                                                 Allocator &,
+                                                 F2 &&);
+
+} // namespace FlexFlow
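
The comparison helpers are thin wrappers over map_tensor_accessors2, one per operator. A sketch of the intended call pattern (editorial, not part of the patch; values are made up):

    // Elementwise comparison of two same-shaped accessors; each output
    // element is the bool of (l == r) for the corresponding input elements.
    GenericTensorAccessorR lhs =
        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
    GenericTensorAccessorR rhs =
        create_1d_accessor_r_with_contents<int32_t>({1, 2, 2}, cpu_allocator);
    GenericTensorAccessorW eq =
        compare_tensor_accessors_eq(lhs, rhs, cpu_allocator);
    // eq holds {true, false, true}.
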
diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
index 8630dcd8cd..6c35185524 100644
--- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
+++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc
@@ -1,4 +1,5 @@
 #include "internal/test_utils.h"
+#include "kernels/create_accessor_with_contents.h"
 #include "kernels/format_accessor_contents.h"
 #include "kernels/replicate_kernels_cpu.h"
 #include "test/utils/doctest/check_kv.h"
@@ -11,11 +12,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
     GenericTensorAccessorR input =
-        create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator);
+        create_1d_accessor_r_with_contents<int32_t>({1, 3, 2}, cpu_allocator);
 
     TensorShape result_shape = TensorShape{
         TensorDims{FFOrdered{3_n}},
-        DataType::FLOAT,
+        DataType::INT32,
     };
     GenericTensorAccessorW result =
         create_zero_filled_accessor_w(result_shape, cpu_allocator);
@@ -32,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Replicate::cpu_backward_kernel") {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
-    GenericTensorAccessorR output = create_2d_accessor_r_with_contents(
+    GenericTensorAccessorR output = create_2d_accessor_r_with_contents<int32_t>(
         {
             {1, 2, 3},
             {4, 3, 3},
         },
         cpu_allocator);
 
-    GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+    GenericTensorAccessorR correct = create_1d_accessor_r_with_contents<int32_t>(
         {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
 
     TensorShape result_shape = TensorShape{
         TensorDims{FFOrdered{3_n}},
-        DataType::FLOAT,
+        DataType::INT32,
     };
     GenericTensorAccessorW result =
         create_zero_filled_accessor_w(result_shape, cpu_allocator);
diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
index db0016cb0b..8c54f4453b 100644
--- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
+++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc
@@ -1,7 +1,9 @@
 #include "internal/test_utils.h"
 #include "kernels/format_accessor_contents.h"
 #include "kernels/reverse_kernels_cpu.h"
+#include "kernels/create_accessor_with_contents.h"
 #include <doctest/doctest.h>
+#include "test/utils/doctest/check_kv.h"
 
 using namespace ::FlexFlow;
 
@@ -9,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("Reverse::cpu_forward_kernel") {
     Allocator cpu_allocator = create_local_cpu_memory_allocator();
 
-    GenericTensorAccessorR input = create_3d_accessor_r_with_contents(
+    GenericTensorAccessorR input = create_3d_accessor_r_with_contents<int32_t>(
         {
             {
                 {1, 3, 2},
@@ -25,7 +27,7 @@
     GenericTensorAccessorW result = create_zero_filled_accessor_w(
         TensorShape{
             TensorDims{FFOrdered{2_n, 2_n, 3_n}},
-            DataType::FLOAT,
+            DataType::INT32,
         },
         cpu_allocator);
 
@@ -34,7 +36,7 @@
         /*axis=*/ff_dim_t{0_n},
       };
 
-      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents(
+      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<int32_t>(
          {
              {
                  {3, 3, 6},
@@ -50,8 +52,7 @@
       Kernels::Reverse::cpu_forward_kernel(input, result, attrs);
 
       CHECK_MESSAGE(accessors_are_equal(result, correct),
-                    "result=",
-                    format_accessor_w_contents(result));
+                    check_kv("result", format_accessor_w_contents(result)));
     }
 
     SUBCASE("axis = ff_dim_t{1}") {
@@ -59,7 +60,7 @@
         /*axis=*/ff_dim_t{1_n},
       };
 
-      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents(
+      GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<int32_t>(
          {
              {
                  {4, 2, 1},
@@ -75,8 +76,7 @@
Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{2}") { @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {2, 3, 1}, @@ -100,15 +100,14 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } } TEST_CASE("Reverse::cpu_backward_kernel") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { { {1, 3, 2}, @@ -124,7 +123,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ TensorDims{FFOrdered{2_n, 2_n, 3_n}}, - DataType::FLOAT, + DataType::INT32, }, cpu_allocator); @@ -133,7 +132,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {3, 3, 6}, @@ -149,8 +148,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{1}") { @@ -158,7 +156,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {4, 2, 1}, @@ -174,8 +172,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } SUBCASE("axis = ff_dim_t{2}") { @@ -183,7 +180,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { { {2, 3, 1}, @@ -199,8 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Reverse::cpu_forward_kernel(input, result, attrs); CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); + check_kv("result", format_accessor_w_contents(result))); } } } diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index 0f34a6aa06..b20ea8ee6b 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -20,198 +20,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } -GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &contents, - Allocator &allocator) { - nonnegative_int ncols = num_elements(contents); - ASSERT(ncols > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{ncols}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = 
create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int col_idx : nonnegative_range(ncols)) { - cpu_accessor.at(FFOrdered{col_idx}) = - contents.at(col_idx.unwrap_nonnegative()); - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &contents, Allocator &allocator) { - nonnegative_int nrows = num_elements(contents); - ASSERT(nrows > 0); - - nonnegative_int ncols = throw_if_unexpected( - require_all_same1(transform(contents, [](std::vector const &row) { - return num_elements(row); - }))); - ASSERT(ncols > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{nrows, ncols}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int row_idx : nonnegative_range(nrows)) { - for (nonnegative_int col_idx : nonnegative_range(ncols)) { - cpu_accessor.at(FFOrdered{row_idx, col_idx}) = - contents.at(row_idx.unwrap_nonnegative()) - .at(col_idx.unwrap_nonnegative()); - } - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &contents, - Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); - - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( - transform(contents, [](std::vector> const &m) { - return num_elements(m); - }))); - ASSERT(dim1_size > 0); - - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( - transform(contents, [](std::vector> const &m) { - return throw_if_unexpected( - require_all_same1(transform(m, [](std::vector const &vec) { - return num_elements(vec); - }))); - }))); - ASSERT(dim2_size > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, - DataType::FLOAT, - }; - - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - cpu_accessor.at( - FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = - contents.at(dim0_idx.unwrap_nonnegative()) - .at(dim1_idx.unwrap_nonnegative()) - .at(dim2_idx.unwrap_nonnegative()); - } - } - } - - GenericTensorAccessorW result = allocator.allocate_tensor(shape); - copy_accessor_data_to_l_from_r( - result, read_only_accessor_from_write_accessor(cpu_accessor)); - - return result; -} - -GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &contents, - Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); - - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &t) { - return num_elements(t); - }))); - ASSERT(dim1_size > 0); - - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &m) { - return 
throw_if_unexpected(require_all_same1( - transform(m, [](std::vector> const &vec) { - return num_elements(vec); - }))); - }))); - ASSERT(dim2_size > 0); - - nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( - contents, [](std::vector>> const &t) { - return throw_if_unexpected(require_all_same1( - transform(t, [](std::vector> const &mat) { - return throw_if_unexpected(require_all_same1( - transform(mat, [](std::vector const &vec) { - return num_elements(vec); - }))); - }))); - }))); - ASSERT(dim3_size > 0); - - TensorShape shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, - DataType::FLOAT, - }; - - GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { - accessor.at( - FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = - contents.at(dim0_idx.unwrap_nonnegative()) - .at(dim1_idx.unwrap_nonnegative()) - .at(dim2_idx.unwrap_nonnegative()) - .at(dim3_idx.unwrap_nonnegative()); - } - } - } - } - - return accessor; -} - -GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_1d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &contents, Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_2d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_3d_accessor_w_with_contents(contents, allocator)); -} - -GenericTensorAccessorR create_4d_accessor_r_with_contents( - std::vector>>> const &contents, - Allocator &allocator) { - return read_only_accessor_from_write_accessor( - create_4d_accessor_w_with_contents(contents, allocator)); -} template struct CreateRandomFilledAccessorW { diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/test/src/internal/test_utils.h index a4fc9b88c8..9147b667d6 100644 --- a/lib/kernels/test/src/internal/test_utils.h +++ b/lib/kernels/test/src/internal/test_utils.h @@ -29,32 +29,6 @@ GenericTensorAccessorW create_zero_filled_accessor_w(TensorShape const &shape, GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, Allocator &allocator); -GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &contents, - Allocator &allocator); -GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &contents, - Allocator &allocator); - -GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &contents, Allocator &allocator); -GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &contents, Allocator &allocator); - -GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &contents, - Allocator &allocator); -GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &contents, - Allocator &allocator); - -GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &contents, - Allocator &allocator); -GenericTensorAccessorR 
create_4d_accessor_r_with_contents(
- std::vector<std::vector<std::vector<std::vector<float>>>> const &contents,
- Allocator &allocator);
-
bool contains_non_zero(GenericTensorAccessorR const &accessor);
void fill_with_zeros(GenericTensorAccessorW const &accessor);
diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/kernels/test/src/kernels/array_coord.cc
new file mode 100644
index 0000000000..128b746a87
--- /dev/null
+++ b/lib/kernels/test/src/kernels/array_coord.cc
@@ -0,0 +1,44 @@
+#include <doctest/doctest.h>
+#include "kernels/array_coord.h"
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+ TEST_CASE("array_coord_drop_dims") {
+ ArrayCoord coord = ArrayCoord{
+ FFOrdered<nonnegative_int>{3_n, 5_n, 0_n, 1_n},
+ };
+
+ SUBCASE("removes dims specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return d.value % 2_n == 0_n; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = ArrayCoord{
+ FFOrdered<nonnegative_int>{5_n, 1_n},
+ };
+
+ CHECK(result == correct);
+ }
+
+ SUBCASE("is identity function if no dimensions are specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return false; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = coord;
+
+ CHECK(result == correct);
+ }
+
+ SUBCASE("returns empty coord if all dimensions are specified to be dropped") {
+ std::function<bool(ff_dim_t)> should_drop_dim
+ = [](ff_dim_t d) { return true; };
+
+ ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim);
+ ArrayCoord correct = ArrayCoord{FFOrdered<nonnegative_int>{}};
+
+ CHECK(result == correct);
+ }
+ }
+}
diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc
new file mode 100644
index 0000000000..d5124180af
--- /dev/null
+++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc
@@ -0,0 +1,57 @@
+#include "internal/test_utils.h"
+#include <doctest/doctest.h>
+#include "kernels/compare_tensor_accessors.h"
+#include "kernels/create_accessor_with_contents.h"
+#include "kernels/format_accessor_contents.h"
+#include "test/utils/doctest/check_kv.h"
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+ TEST_CASE("compare_tensor_accessors_lt") {
+ Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+ GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents<int32_t>(
+ {
+ {
+ {1, 3, 2},
+ {4, 2, 1},
+ },
+ {
+ {3, 3, 6},
+ {2, 1, 5},
+ },
+ },
+ cpu_allocator);
+
+ GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents<int32_t>(
+ {
+ {
+ {2, 3, 3},
+ {5, 1, 0},
+ },
+ {
+ {1, 5, 4},
+ {2, 1, 5},
+ },
+ },
+ cpu_allocator);
+
+ GenericTensorAccessorW result = compare_tensor_accessors_lt(lhs, rhs, cpu_allocator);
+ GenericTensorAccessorR correct = create_3d_accessor_r_with_contents<bool>(
+ {
+ {
+ {true, false, true},
+ {true, false, false},
+ },
+ {
+ {false, true, false},
+ {false, false, false},
+ },
+ },
+ cpu_allocator);
+
+ CHECK_MESSAGE(accessors_are_equal(result, correct),
+ check_kv("result", format_accessor_w_contents(result)));
+ }
+}
diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc
index 915a84c335..a7f2bed5ba 100644
--- a/lib/kernels/test/src/kernels/format_accessor_contents.cc
+++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc
@@ -1,6 +1,7 @@
#include "kernels/format_accessor_contents.h"
#include "internal/test_utils.h"
#include "kernels/local_cpu_allocator.h"
+#include "kernels/create_accessor_with_contents.h"
#include <doctest/doctest.h>
using namespace ::FlexFlow;
@@ -11,7 +12,7 @@
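// The create_*_accessor_with_contents calls in these hunks now name the
// element type explicitly (e.g. create_1d_accessor_r_with_contents<int32_t>),
// matching the DataType stored in the accessor's TensorShape. The
// implementations moved to "kernels/create_accessor_with_contents.h" and are
// not shown in this patch; a rough sketch of the 1-d writer under that
// assumption, using the type_to_data_type_enum_v trait added later in this
// series and omitting the CPU-staging copy the removed test_utils.cc helpers
// performed:
template <typename T>
GenericTensorAccessorW
    create_1d_accessor_w_with_contents(std::vector<T> const &contents,
                                       Allocator &allocator) {
  nonnegative_int ncols = num_elements(contents);

  TensorShape shape = TensorShape{
      TensorDims{FFOrdered<nonnegative_int>{ncols}},
      type_to_data_type_enum_v<T>, // DataType matching T (INT32 for int32_t, ...)
  };

  GenericTensorAccessorW accessor = allocator.allocate_tensor(shape);
  for (nonnegative_int col_idx : nonnegative_range(ncols)) {
    // Write each element through the typed accessor interface.
    accessor.at<type_to_data_type_enum_v<T>>(FFOrdered<nonnegative_int>{col_idx}) =
        contents.at(col_idx.unwrap_nonnegative());
  }
  return accessor;
}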
TEST_SUITE(FF_TEST_SUITE) {
SUBCASE("accessor is 1d") {
GenericTensorAccessorR accessor =
- create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator);
+ create_1d_accessor_r_with_contents<int32_t>({1, 2, 3, 2}, cpu_allocator);
std::string correct = "[1 2 3 2]";
@@ -21,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) {
}
SUBCASE("accessor is 2d") {
- GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents(
+ GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents<int32_t>(
{
{1, 2, 3, 5},
{4, 3, 3, 2},
@@ -41,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
}
SUBCASE("accessor is 3d") {
- GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents(
+ GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents<int32_t>(
{
{
{1, 2, 3, 6},
@@ -86,7 +87,7 @@ TEST_SUITE(FF_TEST_SUITE) {
SUBCASE("accessor is some other dimension") {
GenericTensorAccessorR accessor =
- create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator);
+ create_4d_accessor_r_with_contents<int32_t>({{{{5}}}}, cpu_allocator);
CHECK_THROWS(format_accessor_r_contents(accessor));
}
diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc
index f89121f5c6..3a0f4ffdc4 100644
--- a/lib/kernels/test/src/test_attention_kernel.cc
+++ b/lib/kernels/test/src/test_attention_kernel.cc
@@ -19,7 +19,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
nonnegative_int kvSeqLength = 20_n;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc
index 2d98736c38..e10a80b57f 100644
--- a/lib/kernels/test/src/test_batch_matmul_kernel.cc
+++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc
@@ -15,7 +15,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
int seq_length = -1;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc
index 86c0b7a685..c9a1bf05e6 100644
--- a/lib/kernels/test/src/test_batch_norm_kernel.cc
+++ b/lib/kernels/test/src/test_batch_norm_kernel.cc
@@ -5,7 +5,7 @@
using namespace ::FlexFlow;
-TEST_SUITE(FF_TEST_SUITE) {
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("Test BatchNorm Kernel") {
nonnegative_int output_n = 1_n;
nonnegative_int output_c = 10_n;
@@ -13,7 +13,10 @@
nonnegative_int output_w = 10_n;
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc
index 2040dcbd5d..ddcb0d8c49 100644
--- a/lib/kernels/test/src/test_combine_kernel.cc
+++ b/lib/kernels/test/src/test_combine_kernel.cc
@@ -6,9 +6,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_CUDA_TEST_SUITE) {
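// Every hunk in this group swaps the raw two-argument ManagedPerDeviceFFHandle
// constructor for the initialize_single_gpu_handle helper. The helper itself
// is not shown in this patch; a plausible sketch, assuming it only fills in
// the single-GPU rank defaults expected by the four-argument constructor
// visible in the test_managed_per_device_ff_handle.cc hunk further below:
ManagedPerDeviceFFHandle
    initialize_single_gpu_handle(size_t workSpaceSize,
                                 bool allowTensorOpMathConversion) {
  return ManagedPerDeviceFFHandle{
      /*num_ranks=*/1,
      /*my_rank=*/0,
      workSpaceSize,
      allowTensorOpMathConversion,
  };
}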
TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index c8d74c32ab..20ebb52161 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -6,7 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 3a7ce8fac1..8379e062d5 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -20,7 +20,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index afe3e9793d..dd44b8f50c 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -8,7 +8,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1ed64020ec..c387899709 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -6,7 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index b8d9d725cf..eb62784369 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -22,7 +22,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = 
initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index fb5920adcc..9243601766 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -6,9 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedFFStream") { - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index fc67764cdb..058622e5cb 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -5,8 +5,12 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedPerDeviceFFHandle") { - ManagedPerDeviceFFHandle base_handle{/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle base_handle{ + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, + }; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); SUBCASE("constructor") { @@ -22,8 +26,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { ManagedPerDeviceFFHandle new_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, + }; new_handle = std::move(base_handle); CHECK(&new_handle.raw_handle() == base_handle_ptr); } diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index b32368eb29..283b465abc 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -7,7 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 4098456b8d..ceca1d94dd 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -23,7 +23,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc 
b/lib/kernels/test/src/test_reduction_kernel.cc
index 16d8556bb3..b7990d84fa 100644
--- a/lib/kernels/test/src/test_reduction_kernel.cc
+++ b/lib/kernels/test/src/test_reduction_kernel.cc
@@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc
index 69dbe672ac..ceb0915c03 100644
--- a/lib/kernels/test/src/test_replicate_kernel.cc
+++ b/lib/kernels/test/src/test_replicate_kernel.cc
@@ -1,4 +1,5 @@
#include "internal/test_utils.h"
+#include "kernels/create_accessor_with_contents.h"
#include "kernels/format_accessor_contents.h"
#include "kernels/replicate_kernels.h"
#include "kernels/replicate_kernels_cpu.h"
@@ -20,7 +21,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator gpu_allocator = create_local_cuda_memory_allocator();
@@ -28,7 +32,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
SUBCASE("forward_kernel") {
GenericTensorAccessorR input =
- create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator);
+ create_1d_accessor_r_with_contents<float>({1, 3, 2}, gpu_allocator);
GenericTensorAccessorW output =
gpu_allocator.allocate_tensor(output_shape);
@@ -43,7 +47,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
}
SUBCASE("backward_kernel") {
- GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents(
+ GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents<float>(
{
{1, 2, 3},
{4, 3, 3},
@@ -51,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
},
gpu_allocator);
- GenericTensorAccessorR correct = create_1d_accessor_r_with_contents(
+ GenericTensorAccessorR correct = create_1d_accessor_r_with_contents<float>(
{1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator);
GenericTensorAccessorW input_grad =
@@ -80,9 +84,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
DataType::FLOAT,
};
- ManagedPerDeviceFFHandle managed_handle{
- /*workSpaceSize=*/1024 * 1024,
- /*allowTensorOpMathConversion=*/true};
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator gpu_allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc
index c63a69f76e..69f0a1f214 100644
--- a/lib/kernels/test/src/test_reshape_kernel.cc
+++ b/lib/kernels/test/src/test_reshape_kernel.cc
@@ -5,7 +5,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("Test Reshape Forward and Backward") {
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
ManagedFFStream managed_stream{};
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc
index 87ac4e6713..f2ddb2c67b 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -57,9 +60,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{ - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index e2a220d24a..0d5dcb79a2 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index a623946972..d8ddb8c4b9 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -13,7 +13,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index b5f80956fa..e2042c1e2c 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true + ); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 2deed6b0a2..6f8f4b1543 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; 
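+ // Replaces write_loss_tensor_to_host: returns a read-only view of the
+ // loss tensor's backing buffer, which callers can copy out with
+ // copy_tensor_accessor_r (see the test_e2e.cc hunk below).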
};
} // namespace FlexFlow
diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc
index e58b5dfe7d..790c5e8e18 100644
--- a/lib/local-execution/src/model_training_instance.cc
+++ b/lib/local-execution/src/model_training_instance.cc
@@ -54,14 +54,14 @@ void ModelTrainingInstance::update() {
get_optimizer_attrs_for_next_iter(this->optimizer_attrs);
}
-void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) {
+GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const {
gradient_tensor_t loss_tensor =
this->training_backing.local_tensor_backing.tensor_gradient_mapping.at(
this->logit_tensor);
GenericTensorAccessorW loss_tensor_backing =
this->training_backing.local_tensor_backing.tensor_backings.at(
TensorTypeVariant{loss_tensor});
- write_to_host_float_ptr(loss_tensor_backing, host_ptr);
+ return read_only_accessor_from_write_accessor(loss_tensor_backing);
}
} // namespace FlexFlow
diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc
index b527430d67..6dabe09799 100644
--- a/lib/local-execution/test/src/test_e2e.cc
+++ b/lib/local-execution/test/src/test_e2e.cc
@@ -1,3 +1,5 @@
+#include "kernels/copy_tensor_accessor.h"
+#include "kernels/local_cpu_allocator.h"
#include "kernels/local_cuda_allocator.h"
#include "kernels/managed_ff_stream.h"
#include "kernels/managed_per_device_ff_handle.h"
@@ -14,7 +16,10 @@ using namespace ::FlexFlow;
-bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) {
+bool did_loss_decrease(
+ GenericTensorAccessorR const &first_epoch,
+ GenericTensorAccessorR const &last_epoch,
+ int batch_size
+) {
for (int i = 0; i < batch_size; i++) {
- if (first_epoch[i] < last_epoch[i]) {
+ if (first_epoch.get_float_ptr()[i] < last_epoch.get_float_ptr()[i]) {
return false;
}
@@ -27,7 +32,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
TEST_CASE("LocalBackend e2e Training") {
// initialize runtime
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
@@ -146,28 +154,26 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) {
loss_attrs,
optimizer_attrs};
+ Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
int num_epochs = 5;
- int num_samples = batch_size.unwrap_nonnegative();
- std::vector<float *> loss_values(num_epochs);
+ std::vector<GenericTensorAccessorR> loss_values;
for (int i = 0; i < num_epochs; i++) {
model_training_instance.forward();
model_training_instance.backward();
model_training_instance.update();
- float *host_loss_ptr = new float[num_samples];
- model_training_instance.write_loss_tensor_to_host(host_loss_ptr);
- loss_values[i] = host_loss_ptr;
+ loss_values.push_back(
+ copy_tensor_accessor_r(
+ model_training_instance.get_loss_tensor_accessor(),
+ cpu_allocator));
}
// Assert that each sample in the batch has a lower loss in last epoch than
// the first epoch
- float *first_epoch = loss_values[0];
- float *last_epoch = loss_values[num_epochs - 1];
+ GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+ GenericTensorAccessorR last_epoch = loss_values.back();
CHECK(did_loss_decrease(
- first_epoch, last_epoch, batch_size.unwrap_nonnegative()));
-
- for (int i = 0; i < num_epochs; i++) {
- delete[] loss_values[i];
- }
+ first_epoch_loss, last_epoch, batch_size.unwrap_nonnegative()));
}
}
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc
b/lib/local-execution/test/src/test_local_cost_estimator.cc
index c9c5afe04e..4d015f4cfa 100644
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ b/lib/local-execution/test/src/test_local_cost_estimator.cc
@@ -11,8 +11,10 @@ using namespace ::FlexFlow;
TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("LocalCostEstimator") {
- // local backing initialization
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc
index ca2482653b..e8f48413b6 100644
--- a/lib/local-execution/test/src/test_loss_functions.cc
+++ b/lib/local-execution/test/src/test_loss_functions.cc
@@ -17,7 +17,10 @@ TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("LossFunctions") {
// initialize runtime
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc
index 75ba517d1b..18509d1fd9 100644
--- a/lib/local-execution/test/src/test_update.cc
+++ b/lib/local-execution/test/src/test_update.cc
@@ -15,7 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) {
TEST_CASE("ExecuteUpdate") {
// initialize runtime configs
ManagedFFStream managed_stream{};
- ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
+ ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+ /*workSpaceSize=*/1024 * 1024,
+ /*allowTensorOpMathConversion=*/true
+ );
Allocator allocator = create_local_cuda_memory_allocator();
AllocatedTensors allocated_tensors = make_empty_allocated_tensors();
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index e17f51b73a..9996e36482 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -13,34 +13,58 @@
template <DataType>
struct data_type_enum_to_class;
template <>
-struct data_type_enum_to_class<DataType::FLOAT> {
- using type = float;
-};
+struct data_type_enum_to_class<DataType::FLOAT>
+ : type_identity<float> {};
template <>
-struct data_type_enum_to_class<DataType::DOUBLE> {
- using type = double;
-};
+struct data_type_enum_to_class<DataType::DOUBLE>
+ : type_identity<double> {};
template <>
-struct data_type_enum_to_class<DataType::INT32> {
- using type = int32_t;
-};
+struct data_type_enum_to_class<DataType::INT32>
+ : type_identity<int32_t> {};
template <>
-struct data_type_enum_to_class<DataType::INT64> {
- using type = int64_t;
-};
+struct data_type_enum_to_class<DataType::INT64>
+ : type_identity<int64_t> {};
template <>
-struct data_type_enum_to_class<DataType::HALF> {
- using type = half;
-};
+struct data_type_enum_to_class<DataType::HALF>
+ : type_identity<half> {};
template <>
-struct data_type_enum_to_class<DataType::BOOL> {
- using type = bool;
-};
+struct data_type_enum_to_class<DataType::BOOL>
+ : type_identity<bool> {};
+
+template <typename T>
+struct type_to_data_type_enum;
+
+template <>
+struct type_to_data_type_enum<float>
+ : std::integral_constant<DataType, DataType::FLOAT> {};
+
+template <>
+struct type_to_data_type_enum<double>
+ : std::integral_constant<DataType, DataType::DOUBLE> {};
+
+template <>
+struct type_to_data_type_enum<int32_t>
+ : std::integral_constant<DataType, DataType::INT32> {};
+
+template <>
+struct type_to_data_type_enum<int64_t>
+ : std::integral_constant<DataType, DataType::INT64> {};
+
+template <>
+struct type_to_data_type_enum<half>
+ : std::integral_constant<DataType, DataType::HALF> {};
+
+template <>
+struct type_to_data_type_enum<bool>
+ : std::integral_constant<DataType, DataType::BOOL> {};
+
+template <typename T>
+inline constexpr DataType type_to_data_type_enum_v = type_to_data_type_enum<T>::value;
template <DataType DT, typename T>
typename data_type_enum_to_class<DT>
::type cast_to(T t) { From d1ffea9fd00d35da4b9f5e9b943d06cb25aaf8e2 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 14 May 2025 18:19:32 -0700 Subject: [PATCH 72/91] Fix typo in task-spec --- .proj.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.proj.toml b/.proj.toml index 3a120ca553..b14d763339 100644 --- a/.proj.toml +++ b/.proj.toml @@ -56,7 +56,7 @@ has-cpu-only-benchmarks = false has-cuda-tests = false has-cuda-benchmarks = false -[targets.task_spec] +[targets.task-spec] type = "lib" has-cpu-only-tests = true has-cpu-only-benchmarks = false From 7e45215be7b49d3a5f10140b5732b6e7d6bca658 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 21 May 2025 08:18:08 +0000 Subject: [PATCH 73/91] Add positive_int and tensor reductions/comparisons --- .../src/compiler/allowed_machine_views.cc | 30 +- .../get_machine_resource_splits.cc | 8 +- .../test/src/allowed_machine_views.cc | 40 +- ...racted_tensor_set_movement_across_split.cc | 8 +- .../get_machine_resource_splits.cc | 190 +++---- .../get_optimal_machine_mapping.cc | 22 +- .../get_tensor_set_movement_across_split.cc | 16 +- .../machine_mapping/machine_mapping.cc | 8 +- .../get_machine_mapping_problem_tree.cc | 6 +- .../machine_mapping/machine_mapping_result.cc | 12 +- ...get_optimal_machine_mapping_with_memory.cc | 22 +- .../machine_mapping_result_with_memory.cc | 20 +- ...ion_graph_series_parallel_decomposition.cc | 22 +- .../get_pcg_series_parallel_decomposition.cc | 32 +- .../task_graph_simulator/task_simulator.cc | 34 +- lib/compiler/test/src/graph_optimize_state.cc | 24 +- lib/kernels/include/kernels/accessor.h | 11 +- lib/kernels/include/kernels/array_coord.h | 2 +- lib/kernels/include/kernels/array_shape.h | 34 +- .../kernels/create_accessor_with_contents.h | 63 +-- .../include/kernels/fill_tensor_accessor.h | 22 + lib/kernels/include/kernels/legion_dim.h | 3 +- .../include/kernels/map_tensor_accessors.h | 51 +- .../include/kernels/optimizer_kernels.h | 4 +- .../include/kernels/reduce_tensor_accessor.h | 88 +++ .../include/kernels/reduce_tensor_accessors.h | 39 -- .../reverse_kernels_params.struct.toml | 10 +- .../kernels/tensor_accessor_reductions.h | 13 + lib/kernels/src/cpu/ops/cast_kernels.cc | 4 +- lib/kernels/src/cpu/ops/combine_kernels.cc | 6 +- .../src/cpu/ops/initializer_kernels.cc | 4 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 12 +- lib/kernels/src/cpu/ops/reverse_kernels.cc | 4 +- lib/kernels/src/cuda/cuda_helper.cu | 8 +- lib/kernels/src/cuda/embedding_kernels.cu | 48 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 6 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 4 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 16 +- .../src/cuda/ops/element_unary_kernels.cu | 8 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 6 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 16 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 8 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 16 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 16 +- lib/kernels/src/cuda/optimizer_kernels.cu | 2 +- lib/kernels/src/kernels/accessor.cc | 65 ++- lib/kernels/src/kernels/allocation.cc | 2 +- lib/kernels/src/kernels/array_shape.cc | 57 +- .../src/kernels/compare_tensor_accessors.cc | 36 +- .../src/kernels/fill_tensor_accessor.cc | 26 + .../src/kernels/format_accessor_contents.cc | 25 +- 
.../src/kernels/map_tensor_accessors.cc | 13 +- .../src/kernels/reduce_tensor_accessor.cc | 17 + .../src/kernels/reverse_kernels_params.cc | 10 +- .../src/kernels/tensor_accessor_reductions.cc | 27 + lib/kernels/test/CMakeLists.txt | 7 - lib/kernels/test/modify_test_commands.cmake | 21 - .../test/src/cpu/ops/replicate_kernels.cc | 4 +- .../test/src/cpu/ops/reverse_kernels.cc | 4 +- lib/kernels/test/src/internal/test_utils.cc | 12 +- lib/kernels/test/src/kernels/accessor.cc | 70 ++- lib/kernels/test/src/kernels/array_shape.cc | 63 ++- .../src/kernels/compare_tensor_accessors.cc | 163 ++++++ .../kernels/create_accessor_with_contents.cc | 133 +++++ .../test/src/kernels/map_tensor_accessors.cc | 151 +++++ .../src/kernels/reduce_tensor_accessor.cc | 68 +++ .../src/kernels/tensor_accessor_reductions.cc | 106 ++++ lib/kernels/test/src/test_attention_kernel.cc | 46 +- .../test/src/test_batch_matmul_kernel.cc | 24 +- .../test/src/test_batch_norm_kernel.cc | 18 +- lib/kernels/test/src/test_cast_kernel.cc | 8 +- lib/kernels/test/src/test_combine_kernel.cc | 4 +- lib/kernels/test/src/test_concat_kernel.cc | 30 +- lib/kernels/test/src/test_dropout.cc | 4 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 18 +- .../test/src/test_layer_norm_kernels.cc | 8 +- .../test/src/test_managed_ff_stream.cc | 18 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 48 +- lib/kernels/test/src/test_reduction_kernel.cc | 4 +- lib/kernels/test/src/test_replicate_kernel.cc | 14 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 4 +- lib/kernels/test/src/test_softmax_kernel.cc | 4 +- lib/kernels/test/src/test_split_kernel.cc | 4 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/local-execution/src/loss_functions.cc | 54 +- lib/local-execution/src/optimizer.cc | 26 +- .../test/src/test_allocated_tensors.cc | 8 +- lib/local-execution/test/src/test_e2e.cc | 30 +- .../test/src/test_local_cost_estimator.cc | 16 +- .../test/src/test_local_slots_backing.cc | 276 --------- .../test/src/test_local_task_arg_accessor.cc | 12 +- .../test/src/test_local_tensor_backing.cc | 6 +- .../test/src/test_loss_functions.cc | 16 +- .../test/src/test_task_registry.cc | 8 +- .../test/src/test_unallocated_tensors.cc | 8 +- lib/local-execution/test/src/test_update.cc | 12 +- .../models/bert/bert_config.struct.toml | 16 +- .../candle_uno/candle_uno_config.struct.toml | 10 +- .../models/dlrm/dlrm_config.struct.toml | 14 +- .../inception_v3_config.struct.toml | 6 +- .../include/models/split_test/split_test.h | 2 +- .../transformer_config.struct.toml | 18 +- lib/models/src/models/bert/bert.cc | 22 +- .../src/models/candle_uno/candle_uno.cc | 24 +- lib/models/src/models/dlrm/dlrm.cc | 44 +- .../src/models/inception_v3/inception_v3.cc | 530 +++++++++--------- .../src/models/split_test/split_test.cc | 12 +- .../src/models/transformer/transformer.cc | 26 +- lib/op-attrs/include/op-attrs/datatype.h | 6 +- .../include/op-attrs/datatype_value.h | 3 + .../initializers/kaiming_initializer_mode.h | 4 +- lib/op-attrs/include/op-attrs/ops/attention.h | 36 +- .../multihead_attention_inputs.struct.toml | 12 +- .../op-attrs/ops/attention_attrs.struct.toml | 10 +- .../op-attrs/ops/combine_attrs.struct.toml | 4 +- .../conv_2d/conv_2d_input_shape.struct.toml | 10 +- .../conv_2d_parallel_input_shape.struct.toml | 6 +- .../op-attrs/ops/conv_2d_attrs.struct.toml | 13 +- .../op-attrs/ops/embedding_attrs.struct.toml | 6 +- 
.../op-attrs/ops/linear_attrs.struct.toml | 4 +- lib/op-attrs/include/op-attrs/ops/pool_2d.h | 4 +- .../op-attrs/ops/pool_2d_attrs.struct.toml | 9 +- .../op-attrs/ops/reduction_attrs.struct.toml | 4 +- .../ops/repartition_attrs.struct.toml | 4 +- .../op-attrs/ops/replicate_attrs.struct.toml | 4 +- .../op-attrs/ops/topk_attrs.struct.toml | 4 +- .../parallel_tensor_dim_degrees.struct.toml | 4 +- .../include/op-attrs/parallel_tensor_dims.h | 10 +- .../include/op-attrs/parallel_tensor_shape.h | 14 +- .../discard_copy_degree.struct.toml | 4 +- .../sum_degree.struct.toml | 4 +- .../op-attrs/replica_parallel_dim.struct.toml | 4 +- .../op-attrs/replica_parallel_dim_set.h | 3 +- .../op-attrs/shard_parallel_dim.struct.toml | 6 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 8 +- .../include/op-attrs/tensor_dims.struct.toml | 4 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 8 +- lib/op-attrs/src/op-attrs/datatype.cc | 14 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 11 + .../src/op-attrs/initializer_attrs.cc | 14 +- .../initializers/kaiming_initializer_mode.cc | 8 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 86 +-- .../attention/multihead_attention_inputs.cc | 18 +- .../multihead_attention_parallel_inputs.cc | 6 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 18 +- lib/op-attrs/src/op-attrs/ops/combine.cc | 6 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 14 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 47 +- .../ops/conv_2d/conv_2d_input_shape.cc | 8 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 16 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 10 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 22 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 50 +- lib/op-attrs/src/op-attrs/ops/reduction.cc | 5 +- lib/op-attrs/src/op-attrs/ops/weight.cc | 2 +- .../src/op-attrs/parallel_tensor_dims.cc | 24 +- .../src/op-attrs/parallel_tensor_shape.cc | 20 +- .../src/op-attrs/replica_parallel_dim_set.cc | 8 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 12 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 8 +- .../test/src/op-attrs/ops/attention.cc | 132 ++--- .../test/src/op-attrs/ops/batch_matmul.cc | 126 ++--- .../test/src/op-attrs/ops/batch_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 28 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 19 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 160 +++--- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 118 ++-- lib/op-attrs/test/src/op-attrs/ops/dropout.cc | 48 +- .../test/src/op-attrs/ops/element_binary.cc | 70 +-- .../test/src/op-attrs/ops/element_unary.cc | 30 +- .../test/src/op-attrs/ops/embedding.cc | 56 +- lib/op-attrs/test/src/op-attrs/ops/flat.cc | 130 ++--- .../test/src/op-attrs/ops/layer_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 112 ++-- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 200 +++---- .../test/src/op-attrs/ops/reduction.cc | 19 +- .../test/src/op-attrs/ops/repartition.cc | 14 +- .../test/src/op-attrs/ops/replicate.cc | 16 +- lib/op-attrs/test/src/op-attrs/ops/softmax.cc | 52 +- .../test/src/op-attrs/pcg_operator_attrs.cc | 2 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 24 +- .../include/pcg/computation_graph_builder.h | 38 +- lib/pcg/include/pcg/machine_specification.h | 8 +- .../pcg/machine_specification.struct.toml | 8 +- lib/pcg/include/pcg/operator_task_space.h | 2 +- .../pcg/operator_task_space.struct.toml | 6 +- .../parallel_computation_graph_builder.h | 34 +- 
lib/pcg/include/pcg/stride_t.struct.toml | 4 +- lib/pcg/src/pcg/computation_graph_builder.cc | 42 +- lib/pcg/src/pcg/machine_specification.cc | 9 +- lib/pcg/src/pcg/machine_view.cc | 8 +- lib/pcg/src/pcg/operator_task_space.cc | 8 +- .../generate_weight_transform.cc | 4 +- .../parallel_computation_graph_builder.cc | 38 +- lib/pcg/test/src/pcg/computation_graph.cc | 40 +- .../test/src/pcg/computation_graph_builder.cc | 14 +- .../file_format/v1/v1_computation_graph.cc | 8 +- .../v1/v1_parallel_computation_graph.cc | 10 +- lib/pcg/test/src/pcg/machine_specification.cc | 6 +- lib/pcg/test/src/pcg/machine_view.cc | 74 +-- lib/pcg/test/src/pcg/operator_task_space.cc | 8 +- .../parallel_computation_graph.cc | 34 +- .../parallel_computation_graph_builder.cc | 136 ++--- .../src/pcg/pcg_from_computation_graph.cc | 8 +- .../src/pcg/start_invariant_machine_view.cc | 32 +- .../operator_attribute_value.variant.toml | 6 +- .../materialize_operator_from_attrs_map.cc | 18 +- .../apply_substitution/apply_substitution.cc | 20 +- .../evaluate_substitution_output.cc | 16 +- .../perform_shape_inference.cc | 14 +- .../operator_pattern/get_attribute.cc | 2 +- .../test/src/substitutions/pcg_pattern.cc | 20 +- .../substitutions/unity_substitution_set.cc | 6 +- lib/task-spec/CMakeLists.txt | 2 + lib/task-spec/src/task-spec/ops/attention.cc | 54 +- .../src/task-spec/ops/batch_matmul.cc | 80 +-- lib/task-spec/src/task-spec/ops/batch_norm.cc | 18 +- lib/task-spec/src/task-spec/ops/conv_2d.cc | 10 +- lib/task-spec/src/task-spec/ops/gather.cc | 6 +- lib/task-spec/src/task-spec/ops/layer_norm.cc | 12 +- lib/task-spec/src/task-spec/ops/linear.cc | 32 +- lib/task-spec/src/task-spec/ops/pool_2d.cc | 40 +- lib/task-spec/src/task-spec/ops/reduce.cc | 2 +- lib/task-spec/src/task-spec/ops/reduction.cc | 4 +- lib/task-spec/src/task-spec/ops/replicate.cc | 2 +- lib/task-spec/src/task-spec/ops/softmax.cc | 18 +- lib/task-spec/src/task-spec/ops/split.cc | 20 +- lib/task-spec/src/task-spec/ops/topk.cc | 20 +- lib/task-spec/test/src/task-spec/arg_ref.cc | 2 - lib/utils/include/utils/containers/sum.h | 17 +- .../include/utils/nonnegative_int/ceildiv.h | 1 + .../include/utils/positive_int/ceildiv.h | 12 + .../include/utils/positive_int/positive_int.h | 114 ++++ .../src/utils/nonnegative_int/ceildiv.cc | 20 - lib/utils/src/utils/positive_int/ceildiv.cc | 14 + .../src/utils/positive_int/positive_int.cc | 283 ++++++++++ lib/utils/test/src/utils/containers/sum.cc | 18 + .../test/src/utils/nonnegative_int/ceildiv.cc | 52 -- .../test/src/utils/positive_int/ceildiv.cc | 28 + .../src/utils/positive_int/positive_int.cc | 10 + 249 files changed, 4212 insertions(+), 3136 deletions(-) create mode 100644 lib/kernels/include/kernels/fill_tensor_accessor.h create mode 100644 lib/kernels/include/kernels/reduce_tensor_accessor.h delete mode 100644 lib/kernels/include/kernels/reduce_tensor_accessors.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_reductions.h create mode 100644 lib/kernels/src/kernels/fill_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/reduce_tensor_accessor.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_reductions.cc delete mode 100644 lib/kernels/test/modify_test_commands.cmake create mode 100644 lib/kernels/test/src/kernels/create_accessor_with_contents.cc create mode 100644 lib/kernels/test/src/kernels/map_tensor_accessors.cc create mode 100644 lib/kernels/test/src/kernels/reduce_tensor_accessor.cc create mode 100644 lib/kernels/test/src/kernels/tensor_accessor_reductions.cc delete mode 
100644 lib/local-execution/test/src/test_local_slots_backing.cc create mode 100644 lib/utils/include/utils/positive_int/ceildiv.h create mode 100644 lib/utils/include/utils/positive_int/positive_int.h delete mode 100644 lib/utils/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/src/utils/positive_int/ceildiv.cc create mode 100644 lib/utils/src/utils/positive_int/positive_int.cc delete mode 100644 lib/utils/test/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/positive_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/positive_int/positive_int.cc diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index 6f86d1d82a..fa543e78b5 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -17,7 +17,7 @@ #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" -#include "utils/nonnegative_int/ceildiv.h" +#include "utils/positive_int/ceildiv.h" #include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" @@ -51,24 +51,24 @@ static std::unordered_set DeviceType const &device_type) { auto get_max_stride_upper_bound = - [](std::vector const &tensor_dims, - nonnegative_int total_devices) -> nonnegative_int { + [](std::vector const &tensor_dims, + positive_int total_devices) -> positive_int { nonnegative_int min_num_devices_with_full_stride_volume = - product(transform(tensor_dims, [](nonnegative_int num_devices) { - return nonnegative_int{num_devices.unwrap_nonnegative() - 1}; + product(transform(tensor_dims, [](positive_int num_devices) { + return nonnegative_int{num_devices.int_from_positive_int() - 1}; })); - return ceildiv(total_devices, min_num_devices_with_full_stride_volume); + return ceildiv(total_devices, positive_int{min_num_devices_with_full_stride_volume}); }; - auto candidate_strides = [&](std::vector const &tensor_dims, - nonnegative_int total_devices) + auto candidate_strides = [&](std::vector const &tensor_dims, + positive_int total_devices) -> std::unordered_multiset { - nonnegative_int max_stride_upper_bound = + positive_int max_stride_upper_bound = get_max_stride_upper_bound(tensor_dims, total_devices); std::vector single_stride_range = - transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n), - [](nonnegative_int stride) { return stride_t{stride}; }); + transform(nonnegative_range(1_n, max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), + [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); std::unordered_multiset> raw_stride_vectors = cartesian_product( repeat_element(/*num_times=*/num_elements(tensor_dims), @@ -83,9 +83,9 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) { + for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { for (nonnegative_int device_idx : - nonnegative_range(get_num_devices_per_node(ms, device_type))) { + nonnegative_range(get_num_devices_per_node(ms, device_type).nonnegative_int_from_positive_int())) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } @@ -100,8 +100,8 @@ static std::unordered_set return get_all_permutations_with_repetition(options, 
num_dims(task)); }; - std::vector tensor_dims = task.degrees; - nonnegative_int total_devices = get_num_devices(machine_spec, device_type); + std::vector tensor_dims = task.degrees; + positive_int total_devices = get_num_devices(machine_spec, device_type); std::unordered_set machine_views; diff --git a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc index bb9d54f1e9..e921a0c465 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -11,9 +11,9 @@ std::unordered_set> for (int i = 1; i < resource.num_nodes; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_nodes = nonnegative_int{i}; + sub_resource1.num_nodes = positive_int{i}; sub_resource2.num_nodes = - nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i}; + positive_int{resource.num_nodes.int_from_positive_int() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } @@ -21,9 +21,9 @@ std::unordered_set> for (int i = 1; i < resource.num_gpus_per_node; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_gpus_per_node = nonnegative_int{i}; + sub_resource1.num_gpus_per_node = positive_int{i}; sub_resource2.num_gpus_per_node = - nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i}; + positive_int{resource.num_gpus_per_node.int_from_positive_int() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 817cc80700..15f7d60060 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("1 degree of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; std::unordered_set correct = { MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, }, }; @@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("2 degrees of parallelism") { MachineSpecification ms = MachineSpecification{ - 
/*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, + /*num_nodes=*/3_p, + /*num_cpus_per_node=*/3_p, + /*num_gpus_per_node=*/3_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 3_p}}; auto make_2d_view = [&](nonnegative_int start_node_idx, nonnegative_int start_device_idx, - nonnegative_int stride1, - nonnegative_int stride2, + positive_int stride1, + positive_int stride2, MachineSpecificationDimension m1, MachineSpecificationDimension m2) { return MachineView{ @@ -87,18 +87,18 @@ TEST_SUITE(FF_TEST_SUITE) { auto inter = MachineSpecificationDimension::INTER_NODE; std::unordered_set correct = { make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + 0_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + 1_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), + 0_n, 0_n, /*stride1=*/2_p, /*stride2=*/1_p, inter, intra), make_2d_view( - 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + 0_n, 0_n, /*stride1=*/1_p, /*stride2=*/1_p, intra, inter), make_2d_view( - 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + 0_n, 1_n, /*stride1=*/1_p, /*stride2=*/1_p, intra, inter), make_2d_view( - 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), + 0_n, 0_n, /*stride1=*/2_p, /*stride2=*/1_p, intra, inter), }; std::unordered_set result = diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 13067f5d02..0416a73660 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -28,9 +28,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*op_attrs=*/PCGOperatorAttrs{ RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }, }, /*name=*/std::nullopt, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc index 5f4ba2bfdc..5ae89a8123 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -8,11 +8,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_machine_resource_splits") { - auto make_machine_spec = [](nonnegative_int num_nodes, - nonnegative_int num_gpus_per_node) { + auto make_machine_spec = [](positive_int num_nodes, + positive_int num_gpus_per_node) { return MachineSpecification{ /*num_nodes=*/num_nodes, - /*num_cpus_per_node=*/1_n, + /*num_cpus_per_node=*/1_p, /*num_gpus_per_node=*/num_gpus_per_node, /*inter_node_bandwidth=*/1.0, /*intra_node_bandwidth=*/1.0, @@ -20,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("returns no splits if no splits are 
possible") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n); + MachineSpecification input = make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p); std::unordered_set> result = get_machine_resource_splits(input); @@ -33,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "returns splits in gpu and node dimensions, but not at the same time") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/2_n); + MachineSpecification input = make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/2_p); std::unordered_set> result = get_machine_resource_splits(input); @@ -42,16 +42,16 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set> correct = { { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, }; @@ -62,8 +62,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in node dimension in powers of two") { SUBCASE("num_nodes is a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/8_n, - /*num_gpus_per_node=*/1_n); + make_machine_spec(/*num_nodes=*/8_p, + /*num_gpus_per_node=*/1_p); std::unordered_set< std::pair> @@ -73,34 +73,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/7_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/7_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/7_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/7_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -109,8 +109,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_nodes is not a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/6_n, - /*num_gpus_per_node=*/1_n); + make_machine_spec(/*num_nodes=*/6_p, + /*num_gpus_per_node=*/1_p); std::unordered_set< std::pair> @@ -120,28 +120,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - 
/*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/5_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/5_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/4_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/2_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/2_p, + /*num_gpus_per_node=*/1_p), }, { - make_machine_spec(/*num_nodes=*/5_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/5_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -152,8 +152,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in gpu dimension in powers of two") { SUBCASE("num_gpus_per_node is a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/8_n); + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/8_p); std::unordered_set< std::pair> @@ -163,34 +163,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/7_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/7_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/7_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/7_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; @@ -199,8 +199,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_gpus_per_node is not a power of 2") { MachineSpecification input = - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/6_n); + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/6_p); std::unordered_set< std::pair> @@ -210,28 +210,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/5_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), + make_machine_spec(/*num_nodes=*/1_p, + 
/*num_gpus_per_node=*/5_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/4_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/4_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/2_p), }, { - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/5_n), - make_machine_spec(/*num_nodes=*/1_n, - /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/5_p), + make_machine_spec(/*num_nodes=*/1_p, + /*num_gpus_per_node=*/1_p), }, }; } diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index e506dea1d7..c3342c1b3a 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -68,24 +68,24 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape tensor_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 8_n, + FFOrdered{ + 10_p, + 8_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 51e6074bf2..c5b68e3a76 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -41,9 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -56,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*op_attrs=*/PCGOperatorAttrs{ RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }, }, /*name=*/std::nullopt, @@ -106,7 +106,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -121,7 +121,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -136,7 +136,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { 
MachineViewDimension{ - stride_t{3_n}, + stride_t{3_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -151,7 +151,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index e88b714bd4..928d30ecaa 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -16,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -77,7 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 048f1ddcac..d2c829df30 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -65,9 +65,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 1_n, + FFOrdered{ + 10_p, + 1_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 4a261bcdae..c7a757d91f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -196,7 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -211,7 +211,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -319,7 +319,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -334,7 +334,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc 
b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8ae1ebe753..22202c36f7 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -53,7 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -68,24 +68,24 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1_n, - /*num_cpus_per_node=*/1_n, - /*num_gpus_per_node=*/1_n, + /*num_nodes=*/1_p, + /*num_cpus_per_node=*/1_p, + /*num_gpus_per_node=*/1_p, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape tensor_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 8_n, + FFOrdered{ + 12_p, + 8_p, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 04149cae8f..35b55d2273 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -16,7 +16,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -46,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -166,7 +166,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -181,7 +181,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -367,7 +367,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -382,7 +382,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -499,7 +499,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{1_n}, + stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -514,7 +514,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { MachineViewDimension{ - stride_t{2_n}, + stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -529,7 +529,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*dimensions=*/ { 
MachineViewDimension{ - stride_t{4_n}, + stride_t{4_p}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index fcd508828c..1c801161ca 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -37,9 +37,9 @@ TEST_SUITE(FF_TEST_SUITE) { InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT}; @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("single operator plus inputs and weights") { LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { // op1 op2 LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -267,7 +267,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("real models") { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8_n); + get_split_test_computation_graph(/*batch_size=*/8_p); std::optional sp_decomposition = get_computation_graph_series_parallel_decomposition(cg); @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - b.dense(input, /*outDim=*/14_n); + b.dense(input, /*outDim=*/14_p); return b.computation_graph; }(); @@ -358,7 +358,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8_n); + get_split_test_computation_graph(/*batch_size=*/8_p); std::string result = render_preprocessed_computation_graph_for_sp_decomposition(cg); diff --git a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index 06664b38fa..13f15f6db3 100644 --- a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -22,9 +22,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT}; InitializerAttrs zero_init = InitializerAttrs{ZeroInitializerAttrs{}}; @@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_input = get_only(input_added.outputs); LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -133,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; LinearAttrs linear_attrs = LinearAttrs{ - 
/*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -204,9 +204,9 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 10_n, + TensorDims{FFOrdered{ + 12_p, + 10_p, }}, DataType::FLOAT, }; @@ -218,7 +218,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs p2_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/3_n, + /*repartition_degree=*/3_p, }; ParallelLayerAddedResult p2_added = add_parallel_layer(pcg, make_layer_attrs(p2_attrs), {t_input}, {}); @@ -227,7 +227,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAttrs p3_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{1_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }}, /*name=*/std::nullopt, }; @@ -243,8 +243,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_op0 = get_only(op0_added.outputs); EmbeddingAttrs op1_attrs = EmbeddingAttrs{ - /*num_entires=*/100_n, - /*out_channels=*/22_n, + /*num_entires=*/100_p, + /*out_channels=*/22_p, /*aggr=*/AggregateOp::SUM, /*data_type=*/DataType::FLOAT, }; @@ -262,7 +262,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_w1 = get_only(w1_added.outputs); ReplicateAttrs p1_attrs = ReplicateAttrs{ - /*replicate_degree=*/6_n, + /*replicate_degree=*/6_p, }; ParallelLayerAddedResult p1_added = add_parallel_layer(pcg, make_layer_attrs(p1_attrs), {t_w1}, {}); @@ -272,7 +272,7 @@ TEST_SUITE(FF_TEST_SUITE) { add_parallel_layer(pcg, make_layer_attrs(op1_attrs), {t_op0}, {t_p1}); LinearAttrs op2_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/std::nullopt, @@ -289,7 +289,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_w2 = get_only(w2_added.outputs); ReplicateAttrs p4_attrs = ReplicateAttrs{ - /*replicate_degree=*/3_n, + /*replicate_degree=*/3_p, }; ParallelLayerAddedResult p4_added = add_parallel_layer(pcg, make_layer_attrs(p4_attrs), {t_w2}, {}); @@ -297,7 +297,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs p5_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }; ParallelLayerAddedResult p5_added = add_parallel_layer(pcg, make_layer_attrs(p5_attrs), {t_p4}, {}); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index f320e45d06..c3c83dd6b8 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -38,9 +38,9 @@ namespace FlexFlow { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("task_simulator_estimate_forward_pass_time") { MachineSpecification machine_spec = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/3_n, - /*num_gpus_per_node=*/3_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/3_p, + /*num_gpus_per_node=*/3_p, /*inter_node_bandwidth=*/1.0f, /*intra_node_bandwidth=*/1.0f}; @@ -48,9 +48,9 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraphBuilder b; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 7_n, + FFOrdered{ + 10_p, + 7_p, }, }, DataType::FLOAT, @@ -62,13 +62,13 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t layer1 = get_source_layer(tensor1); std::vector dims = { - 
MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, }; ParallelComputationGraph pcg = b.pcg; @@ -127,9 +127,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 1_n, + FFOrdered{ + 10_p, + 1_p, }, }, DataType::FLOAT, @@ -147,13 +147,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = b.pcg; std::vector dims = { - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, }; diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 5c00ce1558..e7060ef421 100644 --- a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -8,25 +8,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("GraphOptimizeState::operator==") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 32_n, - 16_n, + FFOrdered{ + 32_p, + 16_p, }, }, DataType::FLOAT, }; - // ParallelTensorShape input_shape = - // ParallelTensorShape{ParallelTensorDims{ - // FFOrdered{ - // ShardParallelDim{32_n, 2_n}, - // ShardParallelDim{16_n, 1_n}, - // }, - // ReplicaParallelDimSet{ - // SumDegree{1_n}, - // DiscardCopyDegree{1_n}, - // }, - // }, - // DataType::FLOAT}; // `machine_mapping` is determined by the PCG and the device mapping // algorithm, and `runtime` is determined by the PCG and the device mapping, @@ -43,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { builder.create_input_tensor(input_shape, "input0"); parallel_tensor_guid_t dense0 = builder.dense(/*input=*/input0, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -53,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t dense1 = builder.dense(/*input=*/dense0, - /*outDim=*/4_n, + /*outDim=*/4_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -89,7 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { builder_.create_input_tensor(input_shape, "input0"); parallel_tensor_guid_t dense0_ = builder_.dense(/*input=*/input0_, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 431facd6c1..c24695298b 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -7,7 +7,6 @@ #include "op-attrs/datatype.h" #include "pcg/device_type.dtg.h" #include "utils/containers/transform.h" -#include "utils/required.h" #include namespace FlexFlow { @@ -154,8 +153,6 @@ class GenericTensorAccessorW { std::string format_as(GenericTensorAccessorW const &); std::ostream &operator<<(std::ostream &, 
GenericTensorAccessorW const &); -static_assert(is_fmtable<GenericTensorAccessorW const &>::value, ""); - template <DataType DT> typename data_type_enum_to_class<DT>
::type * get(GenericTensorAccessorW const &a) { @@ -245,6 +242,14 @@ std::pair void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor); +template <DataType DT> +real_type_t<DT>
accessor_get_only_value(GenericTensorAccessorR const &acc) { + ASSERT(get_num_elements(acc.shape) == 1); + ASSERT(acc.data_type == DT); + + return *static_cast<real_type_t<DT> const *>(acc.ptr); +} + } // namespace FlexFlow namespace FlexFlow { diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h index f739a3d707..84e68fa053 100644 --- a/lib/kernels/include/kernels/array_coord.h +++ b/lib/kernels/include/kernels/array_coord.h @@ -5,7 +5,7 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &, +ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, std::function<bool(ff_dim_t)> const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 25ef8116f2..355b6e5bca 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -4,7 +4,7 @@ #include "kernels/array_coord.dtg.h" #include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" -#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" #include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include @@ -16,25 +16,15 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - explicit ArrayShape(LegionOrdered<nonnegative_int> const &dims); + explicit ArrayShape(LegionOrdered<positive_int> const &dims); - /** - * @brief Alias of ArrayShape::num_elements for compatibility with - * Legion::Domain - */ - nonnegative_int get_volume() const; + positive_int num_elements() const; - /** - * @brief Alias of ArrayShape::num_dims for compatibility with Legion::Domain - */ - nonnegative_int get_dim() const; - - nonnegative_int num_elements() const; nonnegative_int num_dims() const; - nonnegative_int operator[](legion_dim_t) const; - nonnegative_int at(legion_dim_t) const; - nonnegative_int at(ff_dim_t) const; + positive_int operator[](legion_dim_t) const; + positive_int at(legion_dim_t) const; + positive_int at(ff_dim_t) const; bool operator==(ArrayShape const &) const; bool operator!=(ArrayShape const &) const; @@ -42,8 +32,8 @@ struct ArrayShape { legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - std::optional<nonnegative_int> at_maybe(legion_dim_t) const; - std::optional<nonnegative_int> at_maybe(ff_dim_t) const; + std::optional<positive_int> at_maybe(legion_dim_t) const; + std::optional<positive_int> at_maybe(ff_dim_t) const; ArrayShape sub_shape(ff_dim_t const &start, std::optional<ff_dim_t> const &end) const; @@ -52,7 +42,7 @@ struct ArrayShape { std::optional<legion_dim_t> const &end) const; public: - LegionOrdered<nonnegative_int> dims; + LegionOrdered<positive_int> dims; private: std::tuple tie() const; @@ -63,13 +53,17 @@ struct ArrayShape { std::string format_as(ArrayShape const &); std::ostream &operator<<(std::ostream &, ArrayShape const &); -nonnegative_int get_volume(ArrayShape const &); +positive_int get_num_elements(ArrayShape const &); ArrayShape array_shape_from_tensor_shape(TensorShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); +std::unordered_set<ff_dim_t> get_ff_dim_t_set(ArrayShape const &); std::unordered_set<ArrayCoord> get_array_coord_set(ArrayShape const &); +ArrayShape array_shape_drop_dims(ArrayShape const &shape, + std::function<bool(ff_dim_t)> const &should_drop_dim); + } // namespace FlexFlow namespace std { diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index fc07d432b2..966a7a30ad 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h
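As context for the accessor and shape changes above, here is a minimal usage sketch of the new accessor_get_only_value helper. This is only a sketch: it assumes create_local_cpu_memory_allocator and the create_accessor_r_filled_with helper declared later in this patch, and the DataTypeValue construction from a float literal is illustrative rather than taken from this series.

// Hypothetical usage: build a one-element FLOAT tensor on the CPU and read it back.
Allocator cpu_allocator = create_local_cpu_memory_allocator();
TensorShape scalar_shape = TensorShape{
    TensorDims{FFOrdered<positive_int>{1_p}},
    DataType::FLOAT,
};
GenericTensorAccessorR scalar_r = create_accessor_r_filled_with(
    scalar_shape, DataTypeValue{3.5f}, cpu_allocator);
// accessor_get_only_value asserts get_num_elements(acc.shape) == 1 and that
// the requested DataType matches before dereferencing the single element.
float v = accessor_get_only_value<DataType::FLOAT>(scalar_r); // == 3.5f

This is the same read-back path that reduce_tensor_accessor_in_all_dims, added later in this patch, uses to collapse a fully reduced tensor into a plain scalar.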
@@ -5,6 +5,7 @@ #include "kernels/allocation.h" #include "kernels/local_cpu_allocator.h" #include "utils/containers/require_all_same1.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -12,8 +13,7 @@ template GenericTensorAccessorW create_1d_accessor_w_with_contents(std::vector const &contents, Allocator &allocator) { - nonnegative_int ncols = num_elements(contents); - ASSERT(ncols > 0); + positive_int ncols = positive_int{num_elements(contents)}; TensorShape shape = TensorShape{ TensorDims{FFOrdered{ncols}}, @@ -23,7 +23,7 @@ GenericTensorAccessorW Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int col_idx : nonnegative_range(ncols)) { + for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{col_idx}) = contents.at(col_idx.unwrap_nonnegative()); } @@ -38,14 +38,12 @@ GenericTensorAccessorW template GenericTensorAccessorW create_2d_accessor_w_with_contents( std::vector> const &contents, Allocator &allocator) { - nonnegative_int nrows = num_elements(contents); - ASSERT(nrows > 0); + positive_int nrows = positive_int{num_elements(contents)}; - nonnegative_int ncols = throw_if_unexpected( + positive_int ncols = throw_if_unexpected( require_all_same1(transform(contents, [](std::vector const &row) { - return num_elements(row); + return positive_int{num_elements(row)}; }))); - ASSERT(ncols > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{nrows, ncols}}, @@ -55,8 +53,8 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int row_idx : nonnegative_range(nrows)) { - for (nonnegative_int col_idx : nonnegative_range(ncols)) { + for (nonnegative_int row_idx : nonnegative_range(nrows.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) .at(col_idx.unwrap_nonnegative()); @@ -74,23 +72,20 @@ template GenericTensorAccessorW create_3d_accessor_w_with_contents( std::vector>> const &contents, Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); + positive_int dim0_size = positive_int{num_elements(contents)}; - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1( + positive_int dim1_size = throw_if_unexpected(require_all_same1( transform(contents, [](std::vector> const &m) { - return num_elements(m); + return positive_int{num_elements(m)}; }))); - ASSERT(dim1_size > 0); - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1( + positive_int dim2_size = throw_if_unexpected(require_all_same1( transform(contents, [](std::vector> const &m) { return throw_if_unexpected( require_all_same1(transform(m, [](std::vector const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); - ASSERT(dim2_size > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size}}, @@ -100,9 +95,9 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : 
nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) @@ -123,35 +118,31 @@ template GenericTensorAccessorW create_4d_accessor_w_with_contents( std::vector>>> const &contents, Allocator &allocator) { - nonnegative_int dim0_size = num_elements(contents); - ASSERT(dim0_size > 0); + positive_int dim0_size = positive_int{num_elements(contents)}; - nonnegative_int dim1_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim1_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &t) { - return num_elements(t); + return positive_int{num_elements(t)}; }))); - ASSERT(dim1_size > 0); - nonnegative_int dim2_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim2_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &m) { return throw_if_unexpected(require_all_same1( transform(m, [](std::vector> const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); - ASSERT(dim2_size > 0); - nonnegative_int dim3_size = throw_if_unexpected(require_all_same1(transform( + positive_int dim3_size = throw_if_unexpected(require_all_same1(transform( contents, [](std::vector>> const &t) { return throw_if_unexpected(require_all_same1( transform(t, [](std::vector> const &mat) { return throw_if_unexpected(require_all_same1( transform(mat, [](std::vector const &vec) { - return num_elements(vec); + return positive_int{num_elements(vec)}; }))); }))); }))); - ASSERT(dim3_size > 0); TensorShape shape = TensorShape{ TensorDims{FFOrdered{dim0_size, dim1_size, dim2_size, dim3_size}}, @@ -160,10 +151,10 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size)) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size)) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size)) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size)) { + for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h new file mode 100644 index 0000000000..8db63f5a2d --- /dev/null +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FILL_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FILL_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include 
"op-attrs/datatype_value.dtg.h" + +namespace FlexFlow { + +void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); + +GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator); + +GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 9a47c8a0fe..63c6ddb3c6 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -7,8 +7,9 @@ #include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/set_of.h" #include "utils/containers/transform.h" -#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/positive_int/positive_int.h" #include "utils/nonnegative_int/num_elements.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h index 8447c60892..eed17cbb61 100644 --- a/lib/kernels/include/kernels/map_tensor_accessors.h +++ b/lib/kernels/include/kernels/map_tensor_accessors.h @@ -8,9 +8,32 @@ #include "kernels/datatype_dispatch.h" #include "utils/containers/require_same.h" #include "utils/containers/require_all_same1.h" +#include namespace FlexFlow { +template +struct CPUMapTensorAccessorInPlace { + template + void operator()(GenericTensorAccessorW &accessor, + F &&f) { + ASSERT(accessor.device_type == DeviceType::CPU); + + for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) { + accessor.at
<DT>(coord.ff_ordered) + = f(accessor.at<DT>
(coord.ff_ordered)); + } + } +}; + +template <typename F> +void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, + F &&f) { + ASSERT(accessor.device_type == DeviceType::CPU); + + DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}(accessor.data_type, accessor, f); +} + template <DataType DT> struct CPUMapTensorAccessor { template <typename F, typename Out = std::invoke_result_t<F, real_type_t<DT>>> @@ -23,7 +46,9 @@ struct CPUMapTensorAccessor { ASSERT(output.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at<DT>(coord.ff_ordered) + output.at< + type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>> + >(coord.ff_ordered) = f(input.at<DT>
(coord.ff_ordered)); } } @@ -31,8 +56,8 @@ struct CPUMapTensorAccessor { template > GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, - Allocator &output_allocator, - F &&f) { + F &&f, + Allocator &output_allocator) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); @@ -43,9 +68,12 @@ GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, return copy_tensor_accessor_w(output_cpu, output_allocator); } -template +template struct CPUMapTensorAccessors2 { - template > + template < + typename F, + typename Out = std::invoke_result_t, real_type_t> + > void operator()(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, GenericTensorAccessorW &output, @@ -63,26 +91,25 @@ struct CPUMapTensorAccessors2 { for (ArrayCoord const &coord : get_array_coord_set(shape)) { output.at>(coord.ff_ordered) - = f(lhs.at
<DT>(coord.ff_ordered), rhs.at<DT>
(coord.ff_ordered)); + = f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); } } }; -template > +template GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, - Allocator &output_allocator, - F &&f) { + DataType output_data_type, + F &&f, + Allocator &output_allocator) { ArrayShape shape = require_same(lhs.shape, rhs.shape); - DataType input_data_type = require_same(lhs.data_type, rhs.data_type); Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); - DataType output_data_type = type_to_data_type_enum_v; GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); - DataTypeDispatch1{}(input_data_type, lhs_cpu, rhs_cpu, output_cpu, f); + DataTypeDispatch2{}(lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 39284b4a6f..51e6f8640f 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -43,18 +43,20 @@ void adam_ps_update_task_gpu(ffStream_t, float *adam_v_ptr, float *adam_m_ptr); +#ifdef FF_USE_NCCL void adam_nccl_update_task_gpu(ffStream_t, float alpha_t, float beta1, float beta2, float weight_decay, float epsilon, - size_t size, PerDeviceFFHandle const &, float const *weight_grad_ptr, + size_t size, float *weight_ptr, float *adam_v_ptr, float *adam_m_ptr); +#endif } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h new file mode 100644 index 0000000000..4be375299f --- /dev/null +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -0,0 +1,88 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSOR_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" +#include "kernels/array_coord.h" +#include "utils/containers/contains.h" +#include "utils/containers/sorted.h" +#include "utils/containers/group_by.h" +#include "utils/containers/transform.h" +#include "utils/containers/foldl1.h" +#include "utils/containers/foldr1.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" + +namespace FlexFlow { + +template +struct CPUReduceTensorAccessorInDims { + template + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + std::unordered_set const &dims_to_reduce, + F &&f) { + using T = real_type_t
<DT>; + + ASSERT(input.device_type == DeviceType::CPU); + ASSERT(output.device_type == DeviceType::CPU); + + auto should_drop_dim = [&](ff_dim_t dim) -> bool { + return contains(dims_to_reduce, dim); + }; + + std::unordered_map<ArrayCoord, std::unordered_set<ArrayCoord>> output_coord_from_input_coord + = group_by(get_array_coord_set(input.shape), + [&](ArrayCoord const &input_coord) { return array_coord_drop_dims(input_coord, should_drop_dim); }); + + for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { + std::vector<T> input_values = transform(sorted(input_coords), + [&](ArrayCoord const &input_coord) -> T { + return input.at<DT>
(input_coord.ff_ordered); + }); + + T result = foldl1(input_values, f); + ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { return f(elem, accum); })); + + output.at<DT>
(output_coord.ff_ordered) = result; + } + } +}; + +template <typename F> +GenericTensorAccessorW reduce_tensor_accessor_in_dims( + GenericTensorAccessorR const &input, + std::unordered_set<ff_dim_t> const &dims, + Allocator &output_allocator, + F &&f) { + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + auto should_drop_dim = [&](ff_dim_t dim) -> bool { + return contains(dims, dim); + }; + + ArrayShape reduced_shape = array_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(reduced_shape, input.data_type)); + + DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}(input_cpu.data_type, input_cpu, output_cpu, dims, f); + + return copy_tensor_accessor_w(output_cpu, output_allocator); +} + +template <DataType DT, typename F> +real_type_t<DT>
reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, + F &&f) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + std::unordered_set<ff_dim_t> input_dims = get_ff_dim_t_set(input.shape); + GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); + + return accessor_get_only_value<DT>
(reduced); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/reduce_tensor_accessors.h b/lib/kernels/include/kernels/reduce_tensor_accessors.h deleted file mode 100644 index c80c41778f..0000000000 --- a/lib/kernels/include/kernels/reduce_tensor_accessors.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H -#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_TENSOR_ACCESSORS_H - -#include "kernels/accessor.h" -#include "kenrels/allocation.h" - -namespace FlexFlow { - - - -template -struct CPUReduceTensorAccessorInDims { - template - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - std::unordered_set const &dims_to_reduce, - F &&f) { - - ASSERT(input.device_type == DeviceType::CPU); - ASSERT(output.device_type == DeviceType::CPU); - - for (ArrayCoord const &coord : get_array_coord_set(input.shape)) { - output.at>(coord) - } - } -}; - -template -GenericTensorAccessorW reduce_tensor_accessor_in_dims(std::unordered_set const &dims, - F &&f) { - -} - -GenericTensorAccessorW reduce_tensor_accessor_all(GenericTensorAcessorR const &input, - Allocator &allocator); - -} // namespace FlexFlow - -#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml index a5dbd750bc..1689594491 100644 --- a/lib/kernels/include/kernels/reverse_kernels_params.struct.toml +++ b/lib/kernels/include/kernels/reverse_kernels_params.struct.toml @@ -8,21 +8,21 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_out_blks" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "reverse_dim_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "in_blk_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "out_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/kernels/include/kernels/tensor_accessor_reductions.h b/lib/kernels/include/kernels/tensor_accessor_reductions.h new file mode 100644 index 0000000000..03502b6943 --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_reductions.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_REDUCTIONS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_REDUCTIONS_H + +#include "kernels/accessor.h" + +namespace FlexFlow { + +bool tensor_accessor_all(GenericTensorAccessorR const &); +bool tensor_accessor_any(GenericTensorAccessorR const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/cpu/ops/cast_kernels.cc index cdd57b8947..08a98f165b 100644 --- a/lib/kernels/src/cpu/ops/cast_kernels.cc +++ b/lib/kernels/src/cpu/ops/cast_kernels.cc @@ -21,7 +21,7 @@ template struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + size_t volume = input.shape.num_elements().int_from_positive_int(); cpu_cast_forward(input.get(), output.get(), volume); } }; @@ -30,7 +30,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume().unwrap_nonnegative(); + size_t volume = 
output.shape.num_elements().int_from_positive_int(); cpu_cast_backward( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc index 577984f21a..557f523f17 100644 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -9,8 +9,8 @@ struct CPUForwardKernel { GenericTensorAccessorW const &output) { memcpy(output.get
<DT>(), input.get<DT>
(), - input.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative()); + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int()); } }; @@ -18,7 +18,7 @@ template struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get<DT>
()[i]; } diff --git a/lib/kernels/src/cpu/ops/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc index 91f4f46ef8..c7f43b5762 100644 --- a/lib/kernels/src/cpu/ops/initializer_kernels.cc +++ b/lib/kernels/src/cpu/ops/initializer_kernels.cc @@ -9,7 +9,7 @@ template struct ZeroInitKernel { void operator()(GenericTensorAccessorW const &tensor) const { auto arr = get
<DT>(tensor); - for (size_t i = 0; i < get_volume(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { arr[i] = 0.0f; } } @@ -25,7 +25,7 @@ struct ConstantInitKernel { DataTypeValue value) const { auto arr = get<DT>
(tensor); auto unwrapped_value = value.get>(); - for (size_t i = 0; i < get_volume(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { arr[i] = unwrapped_value; } } diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc index 798a4ea8c7..d97a274d80 100644 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -1,5 +1,6 @@ #include "kernels/datatype_dispatch.h" #include "kernels/replicate_kernels_cpu.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow::Kernels::Replicate { @@ -9,8 +10,8 @@ struct CPUForwardKernel { GenericTensorAccessorW &output) { memcpy(output.get
<DT>(), input.get<DT>
(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative()); + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int()); } }; @@ -18,11 +19,12 @@ template <DataType DT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, - nonnegative_int num_elements, + positive_int num_elements, nonnegative_int num_replicas) { using T = real_type_t
<DT>; - for (nonnegative_int i : nonnegative_range(num_elements)) { + for (nonnegative_int i : + nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { T cur_sum = 0; for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { cur_sum += output.at<T>
(LegionOrdered{replica_idx, i}); @@ -40,7 +42,7 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input, void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW &input, size_t num_replicas) { - nonnegative_int num_elements = input.shape.num_elements(); + positive_int num_elements = input.shape.num_elements(); DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, output, input, diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/cpu/ops/reverse_kernels.cc index 4d9eb8cc09..212a52881a 100644 --- a/lib/kernels/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/src/cpu/ops/reverse_kernels.cc @@ -9,7 +9,7 @@ struct CPUReverseForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW &output, ReverseAttrs const &attrs) { - nonnegative_int reverse_axis_size = input.shape.at(attrs.axis); + positive_int reverse_axis_size = input.shape.at(attrs.axis); for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { nonnegative_int input_reverse_axis_coord = @@ -17,7 +17,7 @@ ArrayCoord output_coord = input_coord; output_coord.ff_ordered.at(attrs.axis) = - nonnegative_int{reverse_axis_size.unwrap_nonnegative() - + nonnegative_int{reverse_axis_size.int_from_positive_int() - input_reverse_axis_coord.unwrap_nonnegative() - 1}; output.at<DT>
(output_coord.ff_ordered) = diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 86b2d8a437..98faadf5ac 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -224,10 +224,10 @@ ffStatus_t tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, - shape.at_maybe(legion_dim_t{0_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{1_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{2_n}).value_or(1_n).unwrap_nonnegative(), - shape.at_maybe(legion_dim_t{3_n}).value_or(1_n).unwrap_nonnegative()); + shape.at_maybe(legion_dim_t{0_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{1_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{2_n}).value_or(1_p).int_from_positive_int(), + shape.at_maybe(legion_dim_t{3_n}).value_or(1_p).int_from_positive_int()); } cudnnDataType_t ff_to_cudnn_datatype(DataType type) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index cb84f0e777..a7e28c6297 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -343,7 +343,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -354,7 +354,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -380,7 +380,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -391,7 +391,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -417,7 +417,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -428,7 +428,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -454,7 +454,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -465,7 +465,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -491,7 +491,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -502,7 +502,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -528,7 +528,7 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -539,7 +539,7 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -580,7 +580,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -590,7 +590,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -616,7 +616,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -626,7 +626,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -652,7 +652,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -662,7 +662,7 @@ struct 
BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -688,7 +688,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -698,7 +698,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -724,7 +724,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -734,7 +734,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -760,7 +760,7 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -770,7 +770,7 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index f3ea6db660..3de6de9d5e 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume().unwrap_nonnegative(); + size_t volume = input.shape.num_elements().int_from_positive_int(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,7 +52,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.get_volume().unwrap_nonnegative(); + size_t volume = output.shape.num_elements().int_from_positive_int(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 08cc343fd2..4920696756 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -29,8 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get
<DT>(), input.get<DT>
(), - input.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(DT).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -41,7 +41,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); + size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); add_kernel<real_type_t<DT>> <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>( input_grad.get
<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 37dbbe12f8..e7f88bc258 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -30,10 +30,10 @@ void calc_blk_size(size_t &num_blocks, } blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); num_blocks = shape.sub_shape(legion_axis, std::nullopt) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); } void forward_kernel(cudaStream_t stream, diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 16db62a57f..6e446008ed 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -137,15 +137,15 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); - int input_h = input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); - int input_c = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); - int input_n = input.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); + int input_w = input.shape.at(legion_dim_t(0_n)).int_from_positive_int(); + int input_h = input.shape.at(legion_dim_t(1_n)).int_from_positive_int(); + int input_c = input.shape.at(legion_dim_t(2_n)).int_from_positive_int(); + int input_n = input.shape.at(legion_dim_t(3_n)).int_from_positive_int(); - int output_w = output.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); - int output_h = output.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); - int output_c = output.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); - int output_n = output.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); + int output_w = output.shape.at(legion_dim_t(0_n)).int_from_positive_int(); + int output_h = output.shape.at(legion_dim_t(1_n)).int_from_positive_int(); + int output_c = output.shape.at(legion_dim_t(2_n)).int_from_positive_int(); + int output_n = output.shape.at(legion_dim_t(3_n)).int_from_positive_int(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 218e74b939..21ac95c204 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -266,7 +266,7 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +275,7 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -312,7 +312,7 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, 
@@ -323,7 +323,7 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); + size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_backward_kernel> <<>>( num_elements, diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 594a183ff0..9dee095071 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -27,7 +27,7 @@ void forward_kernel(cudaStream_t stream, checkCUDA(cudaMemcpyAsync(output_ptr, input.get_float_ptr(), - input.shape.num_elements().unwrap_nonnegative() * + input.shape.num_elements().int_from_positive_int() * sizeof(float), cudaMemcpyDeviceToDevice, stream)); @@ -40,12 +40,12 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; apply_add_with_scale - <<>>(input_grad_ptr, output_grad_ptr, - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), alpha); } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 19e495a540..bee8f68eef 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -127,13 +127,13 @@ void forward_kernel(ffStream_t stream, output.shape .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); if (m.legion_dim.value == 0_n) { stride = 1; } - coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); - coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t output_dim_size = output.shape.at(m.legion_dim).int_from_positive_int(); + coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -144,7 +144,7 @@ void forward_kernel(ffStream_t stream, input, index, output, - output.shape.get_volume().unwrap_nonnegative(), + output.shape.num_elements().int_from_positive_int(), stride, input_dim_size, output_dim_size); @@ -161,15 +161,15 @@ void backward_kernel(ffStream_t stream, output_grad.shape .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) .num_elements() - .unwrap_nonnegative(); + .int_from_positive_int(); if (m.legion_dim.value == 0_n) { stride = 1; } coord_t output_dim_size = - output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + output_grad.shape.at(m.legion_dim).int_from_positive_int(); coord_t input_dim_size = - input_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + input_grad.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); @@ -180,7 +180,7 @@ void backward_kernel(ffStream_t stream, output_grad, index, input_grad, - output_grad.shape.get_volume().unwrap_nonnegative(), + output_grad.shape.num_elements().int_from_positive_int(), stride, input_dim_size, output_dim_size); diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index b8dfac5204..e4a83a12c8 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -29,8 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + 
input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -43,12 +43,12 @@ struct BackwardKernel { GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { add_kernel> - <<>>(input_grad.get(), output_grad.get(), - input_grad.shape.num_elements().unwrap_nonnegative()); + input_grad.shape.num_elements().int_from_positive_int()); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index d9c09b082d..ac3b7c9b08 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -42,12 +42,12 @@ struct ForwardKernel { size_t num_replicas) { size_t total_elements = - input.shape.num_elements().unwrap_nonnegative() * num_replicas; + input.shape.num_elements().int_from_positive_int() * num_replicas; reduction_forward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), num_replicas); } }; @@ -59,8 +59,8 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 4685fd7a2d..23e65cc1f3 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -40,8 +40,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -54,12 +54,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input, size_t num_replicas) { size_t total_elements = - input.shape.num_elements().unwrap_nonnegative() * num_replicas; + input.shape.num_elements().int_from_positive_int() * num_replicas; replicate_backward_kernel> <<>>( output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), num_replicas); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index a6a390b38e..06aa8d74b2 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -33,8 +33,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements().unwrap_nonnegative() * - size_of_datatype(T).unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), cudaMemcpyDeviceToDevice, stream)); } @@ -47,12 +47,12 @@ struct BackwardKernel { GenericTensorAccessorW const &input) { float alpha = 1.0f; apply_add_with_scale> - <<>>(input.get(), output.get(), - input.shape.num_elements().unwrap_nonnegative(), + input.shape.num_elements().int_from_positive_int(), static_cast>(alpha)); } }; diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu 
b/lib/kernels/src/cuda/ops/reverse_kernels.cu index 582aa02386..c63be7f9b4 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -63,10 +63,10 @@ void forward_kernel(ffStream_t stream, stream, input_accessor.get_float_ptr(), output_accessor.get_float_ptr(), - reverse_kernels_params.num_out_blks.unwrap_nonnegative(), - reverse_kernels_params.reverse_dim_size.unwrap_nonnegative(), - reverse_kernels_params.in_blk_size.unwrap_nonnegative(), - reverse_kernels_params.out_size.unwrap_nonnegative()); + reverse_kernels_params.num_out_blks.int_from_positive_int(), + reverse_kernels_params.reverse_dim_size.int_from_positive_int(), + reverse_kernels_params.in_blk_size.int_from_positive_int(), + reverse_kernels_params.out_size.int_from_positive_int()); } void backward_kernel_internal(cudaStream_t stream, @@ -95,10 +95,10 @@ void backward_kernel(ffStream_t stream, stream, output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - reverse_kernels_params.num_out_blks.unwrap_nonnegative(), - reverse_kernels_params.reverse_dim_size.unwrap_nonnegative(), - reverse_kernels_params.in_blk_size.unwrap_nonnegative(), - reverse_kernels_params.out_size.unwrap_nonnegative()); + reverse_kernels_params.num_out_blks.int_from_positive_int(), + reverse_kernels_params.reverse_dim_size.int_from_positive_int(), + reverse_kernels_params.in_blk_size.int_from_positive_int(), + reverse_kernels_params.out_size.int_from_positive_int()); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 91f3d48a35..13162a9888 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -77,9 +77,9 @@ void forward_kernel(cudaStream_t stream, info.out_strides[i] = 1; } else { int in_dim_size = - input.shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + input.shape.at(legion_dim_t{nonnegative_int{i}}).int_from_positive_int(); int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -88,10 +88,10 @@ void forward_kernel(cudaStream_t stream, .value.unwrap_nonnegative(); } transpose_simple_kernel<<< - GET_BLOCKS(output.shape.get_volume().unwrap_nonnegative()), + GET_BLOCKS(output.shape.num_elements().int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(output.shape.get_volume().unwrap_nonnegative(), + stream>>>(output.shape.num_elements().int_from_positive_int(), input.get_float_ptr(), output.get_float_ptr(), info, @@ -116,9 +116,9 @@ void backward_kernel(cudaStream_t stream, info.out_strides[i] = 1; } else { int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); int out_dim_size = in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .unwrap_nonnegative(); + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -126,10 +126,10 @@ void backward_kernel(cudaStream_t stream, .value.unwrap_nonnegative()] = i; } transpose_simple_kernel<<< - GET_BLOCKS(in_grad.shape.get_volume().unwrap_nonnegative()), + GET_BLOCKS(in_grad.shape.num_elements().int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(in_grad.shape.get_volume().unwrap_nonnegative(), + 
stream>>>(in_grad.shape.num_elements().int_from_positive_int(), out_grad.get_float_ptr(), in_grad.get_float_ptr(), info, diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index fe817876ce..e1ab7eb92c 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -167,7 +167,7 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, } #ifdef FF_USE_NCCL -__host__ void nccl_update_task_gpu(ffStream_t stream, +__host__ void adam_nccl_update_task_gpu(ffStream_t stream, float alpha_t, float beta1, float beta2, diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 409b7533f9..46137c3c9c 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -15,7 +15,7 @@ nonnegative_int "Number of indices does not match the number of dimensions"); nonnegative_int offset = 0_n; - nonnegative_int multiplier = 1_n; + positive_int multiplier = 1_p; for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), @@ -33,8 +33,8 @@ void copy_accessor_data_to_l_from_r( GenericTensorAccessorW &dst_accessor, GenericTensorAccessorR const &src_accessor) { size_t num_bytes = - dst_accessor.shape.get_volume().unwrap_nonnegative() * - size_of_datatype(dst_accessor.data_type).unwrap_nonnegative(); + dst_accessor.shape.num_elements().int_from_positive_int() * + size_of_datatype(dst_accessor.data_type).int_from_positive_int(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; @@ -221,12 +221,60 @@ std::vector return get(a); } +int32_t *get_int32_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +int64_t *get_int64_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +float *get_float_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +double *get_double_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +half *get_half_ptr(GenericTensorAccessorW const &a) { + return get(a); +} + +std::vector + get_int32_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_int64_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_float_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_double_ptrs(std::vector const &a) { + return get(a); +} + +std::vector + get_half_ptrs(std::vector const &a) { + return get(a); +} + + GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { - return GenericTensorAccessorR{writable.data_type, - writable.shape, - req(writable.ptr), - writable.device_type}; + return GenericTensorAccessorR{ + writable.data_type, + writable.shape, + writable.ptr, + writable.device_type, + }; } bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, @@ -263,4 +311,7 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } +template + int32_t accessor_get_only_value(GenericTensorAccessorR const &); + } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc index b9f253bcff..a6881d240a 100644 --- a/lib/kernels/src/kernels/allocation.cc +++ b/lib/kernels/src/kernels/allocation.cc @@ -18,7 +18,7 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); + 
this->allocate(get_size_in_bytes(tensor_shape).int_from_positive_int()); return GenericTensorAccessorW{ tensor_shape.data_type, array_shape_from_tensor_shape(tensor_shape), diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc index 34a53c1bb3..18b8861164 100644 --- a/lib/kernels/src/kernels/array_shape.cc +++ b/lib/kernels/src/kernels/array_shape.cc @@ -11,40 +11,31 @@ #include "utils/hash/tuple.h" #include "utils/hash/vector.h" #include "utils/nonnegative_int/num_elements.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { -ArrayShape::ArrayShape(LegionOrdered const &input_dims) +ArrayShape::ArrayShape(LegionOrdered const &input_dims) : dims(input_dims) {} -nonnegative_int ArrayShape::get_volume() const { - return this->num_elements(); -} - nonnegative_int ArrayShape::num_dims() const { return ::FlexFlow::num_elements(this->dims); } -nonnegative_int ArrayShape::get_dim() const { - return this->num_dims(); -} - -nonnegative_int ArrayShape::num_elements() const { - if (dims.size() == 0) { - return 0_n; - } +positive_int ArrayShape::num_elements() const { return product(this->dims); } -nonnegative_int ArrayShape::operator[](legion_dim_t idx) const { +positive_int ArrayShape::operator[](legion_dim_t idx) const { return dims.at(idx); } -nonnegative_int ArrayShape::at(legion_dim_t idx) const { +positive_int ArrayShape::at(legion_dim_t idx) const { return dims.at(idx); } -nonnegative_int ArrayShape::at(ff_dim_t idx) const { +positive_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } @@ -59,9 +50,9 @@ bool ArrayShape::operator!=(ArrayShape const &other) const { ArrayShape ArrayShape::sub_shape(ff_dim_t const &start, std::optional const &maybe_end) const { - FFOrdered ff_ordered_dims = + FFOrdered ff_ordered_dims = ff_ordered_from_legion_ordered(this->dims); - FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); + FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; } @@ -71,7 +62,7 @@ ArrayShape return ArrayShape{slice(this->dims, start, maybe_end)}; } -std::optional ArrayShape::at_maybe(legion_dim_t index) const { +std::optional ArrayShape::at_maybe(legion_dim_t index) const { if (index.value < dims.size()) { return dims.at(index); } else { @@ -79,11 +70,11 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } } -std::optional ArrayShape::at_maybe(ff_dim_t index) const { +std::optional ArrayShape::at_maybe(ff_dim_t index) const { return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); } -std::tuple const &> ArrayShape::tie() const { +std::tuple const &> ArrayShape::tie() const { return std::tie(this->dims); } @@ -99,8 +90,8 @@ std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { return (s << fmt::to_string(x)); } -nonnegative_int get_volume(ArrayShape const &shape) { - return shape.get_volume(); +positive_int get_num_elements(ArrayShape const &shape) { + return shape.num_elements(); } ArrayShape array_shape_from_tensor_shape(TensorShape const &tensor_shape) { @@ -113,11 +104,15 @@ TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { dtype}; } +std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { + return unordered_set_of(get_idxs(ff_ordered_from_legion_ordered(shape.dims))); +} + std::unordered_set get_array_coord_set(ArrayShape const &shape) { std::vector> per_dim_ranges = 
transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](nonnegative_int dim_size) -> std::vector { - return nonnegative_range(dim_size); + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_points = @@ -129,6 +124,18 @@ std::unordered_set get_array_coord_set(ArrayShape const &shape) { }); } +ArrayShape array_shape_drop_dims(ArrayShape const &shape, + std::function const &should_drop_dim) { + std::vector result; + for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { + if (!should_drop_dim(idx)) { + result.push_back(shape.at(idx)); + } + } + + return ArrayShape{legion_ordered_from_ff_ordered(ff_ordered_of(result))}; +} + } // namespace FlexFlow namespace std { diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc index 4594fed322..b1f5fd39b7 100644 --- a/lib/kernels/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc @@ -6,45 +6,57 @@ namespace FlexFlow { GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l < r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l < r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l <= r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l <= r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l > r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l > r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l >= r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l >= r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l == r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l == r; }, + output_allocator); } GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, output_allocator, - [](auto const &l, auto const &r) { return l != r; }); + return map_tensor_accessors2(lhs, rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l != r; }, + output_allocator); } } // namespace FlexFlow 
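Note: the compare_tensor_accessors_* helpers above now pass the output element type (DataType::BOOL) to map_tensor_accessors2 explicitly, since an elementwise comparison yields BOOL output regardless of the operand dtype, so the output dtype can no longer be inferred from the inputs. A minimal usage sketch under the signature introduced by this patch (tensor contents are illustrative only; create_1d_accessor_r_with_contents is the test helper used elsewhere in this series):

    Allocator cpu_allocator = create_local_cpu_memory_allocator();
    GenericTensorAccessorR lhs =
        create_1d_accessor_r_with_contents({1, 4, 2}, cpu_allocator);
    GenericTensorAccessorR rhs =
        create_1d_accessor_r_with_contents({2, 3, 2}, cpu_allocator);
    // INT32 inputs, BOOL output: the output dtype is its own argument.
    GenericTensorAccessorW lt = map_tensor_accessors2(
        lhs,
        rhs,
        DataType::BOOL,
        [](auto const &l, auto const &r) { return l < r; },
        cpu_allocator);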
diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc new file mode 100644 index 0000000000..f173bd0860 --- /dev/null +++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc @@ -0,0 +1,26 @@ +#include "kernels/fill_tensor_accessor.h" +#include "op-attrs/datatype_value.h" + +namespace FlexFlow { + +void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) { + ASSERT(accessor.device_type == DeviceType::CPU); + ASSERT(accessor.data_type == get_data_type_of_data_type_value(val)); + +} + +GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator) { + NOT_IMPLEMENTED(); +} + +GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, + DataTypeValue val, + Allocator const &allocator) { + return read_only_accessor_from_write_accessor( + create_accessor_w_filled_with(shape, val, allocator)); +} + + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index 1b8ab35d89..3d24483967 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -4,6 +4,7 @@ #include "kernels/local_cpu_allocator.h" #include "utils/indent.h" #include +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -15,10 +16,10 @@ struct Print1DCPUAccessorR { nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 1_n); - nonnegative_int ncols = accessor.shape.at(ff_dim_t{0_n}); + positive_int ncols = accessor.shape.at(ff_dim_t{0_n}); stream << "[" - << join_strings(nonnegative_range(ncols), + << join_strings(nonnegative_range(ncols.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int col_idx) -> std::string { return fmt::to_string( @@ -45,12 +46,12 @@ struct Print2DCPUAccessorR { ASSERT(accessor.device_type == DeviceType::CPU); nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 2_n); - nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); - nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim1_size), + join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim1_idx) -> std::string { return fmt::to_string( @@ -61,7 +62,7 @@ struct Print2DCPUAccessorR { stream << "[\n" << indent( - join_strings(nonnegative_range(dim0_size), "\n", render_1d)) + join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_1d)) << "\n]"; } }; @@ -84,14 +85,14 @@ struct Print3DCPUAccessorR { nonnegative_int dims = accessor.shape.num_dims(); ASSERT(dims == 3_n); - nonnegative_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); - nonnegative_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); - nonnegative_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); + positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n}); + positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n}); + positive_int dim2_size = accessor.shape.at(ff_dim_t{2_n}); auto render_1d = [&](nonnegative_int dim0_idx, nonnegative_int dim1_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim2_size), + join_strings(nonnegative_range(dim2_size.nonnegative_int_from_positive_int()), " 
", [&](nonnegative_int dim2_idx) -> std::string { return fmt::to_string(accessor.at
( @@ -102,7 +103,7 @@ struct Print3DCPUAccessorR { auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { return "[\n" + - indent(join_strings(nonnegative_range(dim1_size), + indent(join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), "\n", [&](nonnegative_int dim1_idx) -> std::string { return render_1d(dim0_idx, dim1_idx); @@ -112,7 +113,7 @@ struct Print3DCPUAccessorR { stream << "[\n" << indent( - join_strings(nonnegative_range(dim0_size), "\n", render_2d)) + join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_2d)) << "\n]"; } }; diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc index 619f1cc412..c59d2207d0 100644 --- a/lib/kernels/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/src/kernels/map_tensor_accessors.cc @@ -9,18 +9,19 @@ struct F1 { template GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &, - Allocator &, - F1 &&); + F1 &&, + Allocator &); struct F2 { - template - float operator()(T const &lhs, T const &rhs) const { NOT_IMPLEMENTED(); } + template + float operator()(T1 const &lhs, T2 const &rhs) const { NOT_IMPLEMENTED(); } }; template GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &, GenericTensorAccessorR const &, - Allocator &, - F2 &&); + DataType, + F2 &&, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/src/kernels/reduce_tensor_accessor.cc new file mode 100644 index 0000000000..b9c4cee085 --- /dev/null +++ b/lib/kernels/src/kernels/reduce_tensor_accessor.cc @@ -0,0 +1,17 @@ +#include "kernels/reduce_tensor_accessor.h" + +namespace FlexFlow { + +using F = std::function; + +template + GenericTensorAccessorW reduce_tensor_accessor_in_dims( + GenericTensorAccessorR const &, + std::unordered_set const &, + Allocator &, + F &&); + +template + int32_t reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &, F &&); + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc index c647181872..0ad1a5ed20 100644 --- a/lib/kernels/src/kernels/reverse_kernels_params.cc +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -6,10 +6,10 @@ ReverseKernelsParams compute_reverse_kernels_params(ArrayShape const &output_shape, ReverseAttrs const &attrs) { auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output_shape.get_dim())) { + positive_int in_blk_size = 1_p; + positive_int reverse_dim_size = 1_p; + positive_int num_out_blks = 1_p; + for (nonnegative_int i : nonnegative_range(output_shape.num_dims())) { if (i < axis.value) { in_blk_size *= output_shape.at(ff_dim_t{i}); } else if (i == axis.value) { @@ -23,7 +23,7 @@ ReverseKernelsParams num_out_blks, reverse_dim_size, in_blk_size, - output_shape.get_volume(), + output_shape.num_elements(), }; } diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc new file mode 100644 index 0000000000..baeb9fadc1 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -0,0 +1,27 @@ +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/reduce_tensor_accessor.h" +#include "utils/overload.h" + +namespace FlexFlow { + +bool tensor_accessor_all(GenericTensorAccessorR 
const &t) { + ASSERT(t.data_type == DataType::BOOL); + + return reduce_tensor_accessor_in_all_dims( + t, overload { + [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); +} + +bool tensor_accessor_any(GenericTensorAccessorR const &t) { + ASSERT(t.data_type == DataType::BOOL); + + return reduce_tensor_accessor_in_all_dims( + t, overload { + [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); +} + +} // namespace FlexFlow diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt index 066cb96753..981f87b3d8 100644 --- a/lib/kernels/test/CMakeLists.txt +++ b/lib/kernels/test/CMakeLists.txt @@ -16,10 +16,3 @@ ff_add_test_executable( cublas pcg ) - -set(FF_TEST_EXEC_NAME "kernels-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/kernels/test/modify_test_commands.cmake b/lib/kernels/test/modify_test_commands.cmake deleted file mode 100644 index 6494ae2d78..0000000000 --- a/lib/kernels/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index 6c35185524..be1e3832ff 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::INT32, }; GenericTensorAccessorW result = @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::INT32, }; GenericTensorAccessorW result = diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 8c54f4453b..9e0f38c8d6 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ - TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + TensorDims{FFOrdered{2_p, 2_p, 3_p}}, DataType::INT32, }, cpu_allocator); @@ -122,7 +122,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW result = create_zero_filled_accessor_w( TensorShape{ - TensorDims{FFOrdered{2_n, 2_n, 3_n}}, + TensorDims{FFOrdered{2_p, 2_p, 3_p}}, 
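+          // 2_p, 3_p, etc. are positive_int literals (the positive_int
+          // counterpart of the _n nonnegative_int literals): TensorDims now
+          // stores positive_int, so zero-size dimensions are unrepresentable
+          // by construction.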
DataType::INT32, }, cpu_allocator); diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index b20ea8ee6b..1d08adb56a 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -33,7 +33,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape).unwrap_nonnegative(); + size_t num_elements = get_num_elements(shape).int_from_positive_int(); if constexpr (std::is_same<T, bool>::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -80,11 +80,11 @@ struct FillWithZeros { if (accessor.device_type == DeviceType::CPU) { memset(accessor.ptr, 0, - accessor.shape.get_volume().unwrap_nonnegative() * sizeof(T)); + accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); } else { checkCUDA(cudaMemset(accessor.ptr, 0, - accessor.shape.get_volume().unwrap_nonnegative() * + accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); } } @@ -101,7 +101,7 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get<DT>
(); - int volume = accessor.shape.num_elements().unwrap_nonnegative(); + int volume = accessor.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; @@ -134,7 +134,7 @@ struct AccessorsAreEqual { T const *a_data_ptr = cpu_accessor_a.get<DT>
(); T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - int volume = accessor_a.shape.num_elements().unwrap_nonnegative(); + int volume = accessor_a.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; @@ -172,7 +172,7 @@ struct CreateFilledAccessorW { T *data_ptr = src_accessor.get<DT>
(); - int volume = dst_accessor.shape.num_elements().unwrap_nonnegative(); + int volume = dst_accessor.shape.num_elements().int_from_positive_int(); for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 98f8471212..2f7e908e0b 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -2,6 +2,7 @@ #include "internal/test_utils.h" #include "kernels/local_cpu_allocator.h" #include +#include "kernels/create_accessor_with_contents.h" using namespace ::FlexFlow; @@ -10,8 +11,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("one dimension") { std::vector indices = {4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 13_n, + std::vector{ + 13_p, }, }; @@ -24,9 +25,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("multiple dimensions") { std::vector indices = {2_n, 4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; @@ -38,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("zero dimensions") { std::vector indices = {}; - ArrayShape shape = ArrayShape{std::vector{}}; + ArrayShape shape = ArrayShape{std::vector{}}; nonnegative_int result = calculate_accessor_offset(indices, shape); nonnegative_int correct = 0_n; @@ -49,9 +50,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("index and shape dimensions do not match") { std::vector indices = {1_n, 2_n, 4_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; @@ -61,13 +62,58 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("out of bounds index") { std::vector indices = {2_n, 5_n}; ArrayShape shape = ArrayShape{ - std::vector{ - 6_n, - 5_n, + std::vector{ + 6_p, + 5_p, }, }; CHECK_THROWS(calculate_accessor_offset(indices, shape)); } } + + TEST_CASE("accessor_get_only_value") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns the value if the accessor only contains one value") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + }, + }, + cpu_allocator); + + float result = accessor_get_only_value(input); + float correct = 12; + + CHECK(result == correct); + } + + + SUBCASE("throws an error if the requested type does not match the type in the accessor") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + }, + }, + cpu_allocator); + + CHECK_THROWS(accessor_get_only_value(input)); + } + + SUBCASE("throws an error if the accessor contains multiple values") { + GenericTensorAccessorR input = create_3d_accessor_r_with_contents( + { + { + {12}, + {12}, + }, + }, + cpu_allocator); + + CHECK_THROWS(accessor_get_only_value(input)); + } + } } diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc index 1fb4c0b541..2665cdda36 100644 --- a/lib/kernels/test/src/kernels/array_shape.cc +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_array_coord_set") { SUBCASE("ArrayShape is not empty") { ArrayShape input = ArrayShape{ - LegionOrdered{2_n, 1_n, 3_n}, + LegionOrdered{2_p, 1_p, 3_p}, }; std::unordered_set result = get_array_coord_set(input); @@ -24,26 +24,67 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("ArrayShape has a dimension of size zero") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_n, 0_n, 3_n}, - }; + SUBCASE("ArrayShape is zero-dimensional") { + ArrayShape input = ArrayShape{LegionOrdered{}}; 
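+      // With dims stored as positive_int, num_elements() is the product over
+      // an empty dim list, i.e. 1, so a zero-dimensional shape has exactly one
+      // coordinate: the empty ArrayCoord asserted below.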
std::unordered_set<ArrayCoord> result = get_array_coord_set(input); - std::unordered_set<ArrayCoord> correct = {}; + std::unordered_set<ArrayCoord> correct = { + ArrayCoord{FFOrdered{}}, }; CHECK(result == correct); } + } - SUBCASE("ArrayShape is zero-dimensional") { - ArrayShape input = ArrayShape{LegionOrdered{}}; + TEST_CASE("array_shape_drop_dims") { + ArrayShape input = ArrayShape{ + LegionOrdered{2_p, 4_p, 3_p}, + }; - std::unordered_set<ArrayCoord> result = get_array_coord_set(input); - std::unordered_set<ArrayCoord> correct = { - ArrayCoord{FFOrdered{}}, + SUBCASE("removes dims specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return dim.value % 2_n == 0; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = ArrayShape{ + LegionOrdered{4_p}, }; CHECK(result == correct); } + + SUBCASE("is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return false; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = input; + + CHECK(result == correct); + } + + SUBCASE("returns empty shape if all dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { + return true; + }; + + ArrayShape result = array_shape_drop_dims(input, should_drop_dim); + ArrayShape correct = ArrayShape{LegionOrdered{}}; + + CHECK(result == correct); + } } } diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index d5124180af..54706ad74e 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -54,4 +54,167 @@ CHECK_MESSAGE(accessors_are_equal(result, correct), check_kv("result", format_accessor_w_contents(result))); } + + TEST_CASE("compare_tensor_accessors_le") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( + { + { + {4, 2, 1}, + }, + { + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( + { + { + {5, 1, 0}, + }, + { + {2, 1, 5}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_le(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( + { + { + {true, false, false}, + }, + { + {true, true, true}, + }, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_gt") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2, 1}, + {2, 1, 5}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 1, 0}, + {2, 1, 5}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true, true}, + {false, false, false}, + }, + cpu_allocator); + +
CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_ge") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {2, 5}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 1}, + {3, 6}, + {1, 0}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true}, + {false, false}, + {true, true}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_eq") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 2}, + {1, 8}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {false, true}, + {true, true}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("compare_tensor_accessors_ne") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( + { + {4, 2}, + {1, 8}, + {1, 2}, + }, + cpu_allocator); + + GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( + { + {5, 2}, + {1, 8}, + {2, 2}, + }, + cpu_allocator); + + GenericTensorAccessorW result = compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {true, false}, + {false, false}, + {true, false}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } } diff --git a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc new file mode 100644 index 0000000000..a6cfdbc97f --- /dev/null +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -0,0 +1,133 @@ +#include +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("create_1d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); + + auto at = [&](nonnegative_int c) -> float { + return result.at(FFOrdered{c}); + }; + + CHECK(at(0_n) == 1); + CHECK(at(1_n) == 4); + CHECK(at(2_n) == 1); + CHECK(at(3_n) == 2); + } + + TEST_CASE("create_2d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_2d_accessor_w_with_contents( + { + {1, 4, 2}, + {2, 2, 7}, + }, + cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 1); + CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 2); + CHECK(at(1_n, 0_n) == 2); + CHECK(at(1_n, 1_n) 
== 2); + CHECK(at(1_n, 2_n) == 7); + } + + TEST_CASE("create_3d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_3d_accessor_w_with_contents( + { + { + {1, 4}, + {2, 3}, + {7, 2}, + }, + { + {9, 3}, + {4, 5}, + {0, 2}, + }, + }, + cpu_allocator); + + auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{s, r, c}); + }; + + CHECK(at(0_n, 0_n, 0_n) == 1); + CHECK(at(0_n, 0_n, 1_n) == 4); + CHECK(at(0_n, 1_n, 0_n) == 2); + CHECK(at(0_n, 1_n, 1_n) == 3); + CHECK(at(0_n, 2_n, 0_n) == 7); + CHECK(at(0_n, 2_n, 1_n) == 2); + CHECK(at(1_n, 0_n, 0_n) == 9); + CHECK(at(1_n, 0_n, 1_n) == 3); + CHECK(at(1_n, 1_n, 0_n) == 4); + CHECK(at(1_n, 1_n, 1_n) == 5); + CHECK(at(1_n, 2_n, 0_n) == 0); + CHECK(at(1_n, 2_n, 1_n) == 2); + } + + TEST_CASE("create_4d_accessor_w_with_contents") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW result + = create_4d_accessor_w_with_contents( + { + { + { + {2, 3}, + {7, 2}, + }, + { + {4, 5}, + {0, 2}, + }, + }, + { + { + {9, 6}, + {1, 2}, + }, + { + {8, 7}, + {3, 8}, + }, + }, + }, + cpu_allocator); + + auto at = [&](nonnegative_int s1, nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{s1, s2, r, c}); + }; + + CHECK(at(0_n, 0_n, 0_n, 0_n) == 2); + CHECK(at(0_n, 0_n, 0_n, 1_n) == 3); + CHECK(at(0_n, 0_n, 1_n, 0_n) == 7); + CHECK(at(0_n, 0_n, 1_n, 1_n) == 2); + CHECK(at(0_n, 1_n, 0_n, 0_n) == 4); + CHECK(at(0_n, 1_n, 0_n, 1_n) == 5); + CHECK(at(0_n, 1_n, 1_n, 0_n) == 0); + CHECK(at(0_n, 1_n, 1_n, 1_n) == 2); + CHECK(at(1_n, 0_n, 0_n, 0_n) == 9); + CHECK(at(1_n, 0_n, 0_n, 1_n) == 6); + CHECK(at(1_n, 0_n, 1_n, 0_n) == 1); + CHECK(at(1_n, 0_n, 1_n, 1_n) == 2); + CHECK(at(1_n, 1_n, 0_n, 0_n) == 8); + CHECK(at(1_n, 1_n, 0_n, 1_n) == 7); + CHECK(at(1_n, 1_n, 1_n, 0_n) == 3); + CHECK(at(1_n, 1_n, 1_n, 1_n) == 8); + } +} diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc new file mode 100644 index 0000000000..fcc59b7935 --- /dev/null +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -0,0 +1,151 @@ +#include +#include "kernels/map_tensor_accessors.h" +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("map_tensor_accessor_inplace") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW accessor = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + map_tensor_accessor_inplace(accessor, [](float x) { return x + 1; }); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return accessor.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 2); + CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 3); + CHECK(at(1_n, 0_n) == 3); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 6); + } + + TEST_CASE("map_tensor_accessor") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW input = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + SUBCASE("function is not type changing") { + GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) { return x + 1; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 2); + 
CHECK(at(0_n, 1_n) == 4); + CHECK(at(0_n, 2_n) == 3); + CHECK(at(1_n, 0_n) == 3); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 6); + } + + SUBCASE("function is type changing") { + GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) -> bool { return x > 2; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == false); + CHECK(at(0_n, 1_n) == true); + CHECK(at(0_n, 2_n) == false); + CHECK(at(1_n, 0_n) == false); + CHECK(at(1_n, 1_n) == false); + CHECK(at(1_n, 2_n) == true); + } + } + + TEST_CASE("map_tensor_accessors2") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW lhs = create_2d_accessor_w_with_contents( + { + {1, 3, 2}, + {2, 1, 5}, + }, + cpu_allocator); + + SUBCASE("argument types are the same") { + GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( + { + {0, 2, 5}, + {3, 3, 8}, + }, + cpu_allocator); + + SUBCASE("function is not type changing") { + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::FLOAT, [](float l, float r) { return l + 2 * r; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> float { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == 1); + CHECK(at(0_n, 1_n) == 7); + CHECK(at(0_n, 2_n) == 12); + CHECK(at(1_n, 0_n) == 8); + CHECK(at(1_n, 1_n) == 7); + CHECK(at(1_n, 2_n) == 21); + } + + SUBCASE("function is type changing") { + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::BOOL, [](float l, float r) -> bool { return l > r; }, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == true); + CHECK(at(0_n, 1_n) == true); + CHECK(at(0_n, 2_n) == false); + CHECK(at(1_n, 0_n) == false); + CHECK(at(1_n, 1_n) == false); + CHECK(at(1_n, 2_n) == false); + } + } + + SUBCASE("argument types are not the same") { + GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( + { + {true, false, true}, + {true, false, false}, + }, + cpu_allocator); + + auto func = [](float l, bool r) -> double { + if (r) { + return (- l); + } else { + return l * 2; + } + }; + GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::DOUBLE, func, cpu_allocator); + + auto at = [&](nonnegative_int r, nonnegative_int c) -> double { + return result.at(FFOrdered{r, c}); + }; + + CHECK(at(0_n, 0_n) == -1); + CHECK(at(0_n, 1_n) == 6); + CHECK(at(0_n, 2_n) == -2); + CHECK(at(1_n, 0_n) == -2); + CHECK(at(1_n, 1_n) == 2); + CHECK(at(1_n, 2_n) == 10); + } + } +} diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc new file mode 100644 index 0000000000..0e69b3b937 --- /dev/null +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -0,0 +1,68 @@ +#include +#include "kernels/reduce_tensor_accessor.h" +#include "internal/test_utils.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include "kernels/create_accessor_with_contents.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("reduce_tensor_accessor_in_dims") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); + + GenericTensorAccessorW result = 
reduce_tensor_accessor_in_dims( + accessor, + {ff_dim_t{0_n}, ff_dim_t{2_n}}, + cpu_allocator, + [](int32_t accum, int32_t x) { return x + accum; }); + + GenericTensorAccessorW correct = create_1d_accessor_w_with_contents( + { + 1 + 3 + 2 + 4 + 2 + 1, + 2 + 1 + 5 + 8 + 3 + 6, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result =", format_accessor_w_contents(result)), + check_kv("correct=", format_accessor_w_contents(correct))); + } + + + TEST_CASE("reduce_tensor_accessor_in_all_dims") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); + + int32_t result = reduce_tensor_accessor_in_all_dims( + accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); + int32_t correct = 1 + 3 + 2 + 2 + 1 + 5 + 4 + 2 + 1 + 8 + 3 + 6; + + CHECK(result == correct); + } +} diff --git a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc new file mode 100644 index 0000000000..744b875ee7 --- /dev/null +++ b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc @@ -0,0 +1,106 @@ +#include +#include "kernels/create_accessor_with_contents.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/tensor_accessor_reductions.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("tensor_accessor_all") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns false if any elements are false") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {true, true, true}, + {true, true, true}, + }, + { + {true, false, true}, + {true, true, true}, + }, + }, + cpu_allocator); + + bool result = tensor_accessor_all(accessor); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("returns true if all elements are true") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {true, true, true}, + {true, true, true}, + }, + cpu_allocator); + + bool result = tensor_accessor_all(accessor); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("throw an error if the datatype is not bool") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); + + CHECK_THROWS(tensor_accessor_all(accessor)); + } + } + + TEST_CASE("tensor_accessor_any") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + SUBCASE("returns true if any elements are true") { + GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( + { + { + {false, false, false}, + {true, false, false}, + }, + { + {false, false, false}, + {false, false, false}, + }, + }, + cpu_allocator); + + bool result = tensor_accessor_any(accessor); + bool correct = true; + + CHECK(result == correct); + } + + SUBCASE("returns false if all elements are false") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {false, false, false}, + {false, false, false}, + }, + cpu_allocator); + + bool result = tensor_accessor_any(accessor); + bool correct = false; + + CHECK(result == correct); + } + + SUBCASE("throw an error if the datatype is not bool") { + GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); + + CHECK_THROWS(tensor_accessor_any(accessor)); + } + } 
+} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 3a0f4ffdc4..3b024fdf55 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -6,17 +6,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { - nonnegative_int num_samples = 10_n; - nonnegative_int num_heads = 4_n; - nonnegative_int qSize = 64_n; - nonnegative_int kSize = 64_n; - nonnegative_int vSize = 64_n; - nonnegative_int qProjSize = 64_n; - nonnegative_int kProjSize = 64_n; - nonnegative_int vProjSize = 64_n; - nonnegative_int oProjSize = 64_n; - nonnegative_int qoSeqLength = 20_n; - nonnegative_int kvSeqLength = 20_n; + positive_int num_samples = 10_p; + positive_int num_heads = 4_p; + positive_int qSize = 64_p; + positive_int kSize = 64_p; + positive_int vSize = 64_p; + positive_int qProjSize = 64_p; + positive_int kProjSize = 64_p; + positive_int vProjSize = 64_p; + positive_int oProjSize = 64_p; + positive_int qoSeqLength = 20_p; + positive_int kvSeqLength = 20_p; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -29,17 +29,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( managed_handle.raw_handle(), allocator, - /*num_samples=*/num_samples.unwrap_nonnegative(), - /*num_heads=*/num_heads.unwrap_nonnegative(), - /*qSize=*/qSize.unwrap_nonnegative(), - /*kSize=*/kSize.unwrap_nonnegative(), - /*vSize=*/vSize.unwrap_nonnegative(), - /*qProjSize=*/qProjSize.unwrap_nonnegative(), - /*kProjSize=*/kProjSize.unwrap_nonnegative(), - /*vProjSize=*/vProjSize.unwrap_nonnegative(), - /*oProjSize=*/oProjSize.unwrap_nonnegative(), - /*qoSeqLength=*/qoSeqLength.unwrap_nonnegative(), - /*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), + /*num_samples=*/num_samples.int_from_positive_int(), + /*num_heads=*/num_heads.int_from_positive_int(), + /*qSize=*/qSize.int_from_positive_int(), + /*kSize=*/kSize.int_from_positive_int(), + /*vSize=*/vSize.int_from_positive_int(), + /*qProjSize=*/qProjSize.int_from_positive_int(), + /*kProjSize=*/kProjSize.int_from_positive_int(), + /*vProjSize=*/vProjSize.int_from_positive_int(), + /*oProjSize=*/oProjSize.int_from_positive_int(), + /*qoSeqLength=*/qoSeqLength.int_from_positive_int(), + /*kvSeqLength=*/kvSeqLength.int_from_positive_int(), /*add_bias_kv=*/false); TensorShape query_shape = TensorShape{ @@ -59,7 +59,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{nonnegative_int{state.weightSize}}}, + TensorDims{FFOrdered{positive_int{state.weightSize}}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index e10a80b57f..4ca8811b9b 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { - nonnegative_int m = 10_n; - nonnegative_int n = 10_n; - nonnegative_int k = 10_n; - nonnegative_int batch = 5_n; + positive_int m = 10_p; + positive_int n = 10_p; + positive_int k = 10_p; + positive_int batch = 5_p; int a_seq_length_dim = -1; int b_seq_length_dim = -1; int seq_length = -1; @@ -48,10 +48,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { output_accessor.get_float_ptr(), a_accessor.get_float_ptr(), 
b_accessor.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int(), a_seq_length_dim, b_seq_length_dim, seq_length); @@ -73,10 +73,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { a_grad_accessor.get_float_ptr(), b_accessor.get_float_ptr(), b_grad_accessor.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative()); + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index c9a1bf05e6..00a26c3303 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -7,10 +7,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { - nonnegative_int output_n = 1_n; - nonnegative_int output_c = 10_n; - nonnegative_int output_h = 10_n; - nonnegative_int output_w = 10_n; + positive_int output_n = 1_p; + positive_int output_c = 10_p; + positive_int output_h = 10_p; + positive_int output_w = 10_p; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -24,10 +24,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*handle=*/managed_handle.raw_handle(), /*allocator=*/allocator, /*runningMean=*/nullptr, - /*output_n=*/output_n.unwrap_nonnegative(), - /*output_c=*/output_c.unwrap_nonnegative(), - /*output_h=*/output_h.unwrap_nonnegative(), - /*output_w=*/output_w.unwrap_nonnegative(), + /*output_n=*/output_n.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), /*relu=*/true); TensorShape input_shape = TensorShape{ @@ -90,7 +90,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), /*numElements=*/ - input_accessor.shape.num_elements().unwrap_nonnegative()); + input_accessor.shape.num_elements().int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 0c41fe12ac..7539b2457c 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,11 +11,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::DOUBLE, }; @@ -52,11 +52,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 2_n}}, + TensorDims{FFOrdered{10_p, 2_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 2_n}}, + TensorDims{FFOrdered{10_p, 2_p}}, DataType::DOUBLE, }; diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index ddcb0d8c49..6ce415d48c 100644 --- 
a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n, 100_n}}, + TensorDims{FFOrdered{100_p, 100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -53,7 +53,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_n, 5_n}}, + TensorDims{FFOrdered{5_p, 5_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 20ebb52161..b22add8905 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - const nonnegative_int num_inputs = 4_n; + const positive_int num_inputs = 4_p; SUBCASE("forward_kernel") { - auto run_forward_test = [&](nonnegative_int input_rows, - nonnegative_int input_cols, + auto run_forward_test = [&](positive_int input_rows, + positive_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { TensorShape input_shape = TensorShape{ @@ -26,7 +26,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; std::vector input_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs.nonnegative_int_from_positive_int(), [&]() { return create_random_filled_accessor_r(input_shape, allocator); }); @@ -42,8 +42,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; SUBCASE("test forward concat, axis = 0") { - nonnegative_int input_rows = 2_n; - nonnegative_int input_cols = 4_n; + positive_int input_rows = 2_p; + positive_int input_cols = 4_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, DataType::FLOAT, @@ -52,8 +52,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("test forward concat, axis = 1") { - nonnegative_int input_rows = 4_n; - nonnegative_int input_cols = 2_n; + positive_int input_rows = 4_p; + positive_int input_cols = 2_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, DataType::FLOAT, @@ -63,8 +63,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("backward_kernel") { - auto run_backward_test = [&](nonnegative_int input_rows, - nonnegative_int input_cols, + auto run_backward_test = [&](positive_int input_rows, + positive_int input_cols, TensorShape output_shape, ff_dim_t concat_axis) { TensorShape input_shape = TensorShape{ @@ -76,7 +76,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { create_random_filled_accessor_r(output_shape, allocator); std::vector input_grad_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs.nonnegative_int_from_positive_int(), [&]() { return create_zero_filled_accessor_w(input_shape, allocator); }); @@ -91,8 +91,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; SUBCASE("test backward concat, axis = 0") { - nonnegative_int input_rows = 2_n; - nonnegative_int input_cols = 4_n; + positive_int input_rows = 2_p; + positive_int input_cols = 4_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{num_inputs * input_rows, input_cols}}, DataType::FLOAT, @@ -101,8 +101,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("test backward concat, axis = 1") { - nonnegative_int input_rows = 4_n; - nonnegative_int input_cols = 2_n; + positive_int 
input_rows = 4_p; + positive_int input_cols = 2_p; TensorShape output_shape = TensorShape{ TensorDims{FFOrdered{input_rows, num_inputs * input_cols}}, DataType::FLOAT, diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 8379e062d5..1b224084f8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -10,11 +10,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10_n, 10_n}, + std::vector{10_p, 10_p}, }; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index dd44b8f50c..98896cca18 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index c387899709..52389ea0f5 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -39,15 +39,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -55,15 +55,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 1D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -91,15 +91,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::FLOAT, }; run_backward_test(input_shape, index_shape, output_shape); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index eb62784369..4f3b701bba 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -7,8 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { - nonnegative_int batch_size = 10_n; - nonnegative_int feature_size = 10_n; + positive_int batch_size = 10_p; + positive_int 
feature_size = 10_p; float epsilon = 1e-5f; bool elementwise_affine = true; @@ -34,8 +34,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), allocator, elementwise_affine, - batch_size.unwrap_nonnegative(), - feature_size.unwrap_nonnegative(), + batch_size.int_from_positive_int(), + feature_size.int_from_positive_int(), epsilon); GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 9243601766..099536ce0d 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -38,15 +38,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 20_n}}, + TensorDims{FFOrdered{2_p, 20_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -54,15 +54,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather forward, 1D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; run_forward_test(input_shape, index_shape, output_shape); @@ -90,15 +90,15 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("test gather backward, 2D") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 100_n}}, + TensorDims{FFOrdered{2_p, 100_p}}, DataType::FLOAT, }; TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::INT32, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_n, 25_n}}, + TensorDims{FFOrdered{2_p, 25_p}}, DataType::FLOAT, }; run_backward_test(input_shape, index_shape, output_shape); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 283b465abc..94ce8f4848 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ceca1d94dd..7691daf7a6 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { - nonnegative_int input_w = 10_n; - nonnegative_int input_h = 10_n; - nonnegative_int input_c = 3_n; - nonnegative_int input_n = 1_n; - nonnegative_int output_w = 5_n; - nonnegative_int output_h = 5_n; - nonnegative_int output_c = 3_n; - nonnegative_int output_n = 1_n; + positive_int input_w = 10_p; + positive_int input_h = 10_p; + positive_int input_c = 3_p; + positive_int input_n 
= 1_p; + positive_int output_w = 5_p; + positive_int output_h = 5_p; + positive_int output_c = 3_p; + positive_int output_n = 1_p; nonnegative_int pad_h = 0_n; nonnegative_int pad_w = 0_n; - nonnegative_int kernel_h = 2_n; - nonnegative_int kernel_w = 2_n; - nonnegative_int stride_h = 2_n; - nonnegative_int stride_w = 2_n; + positive_int kernel_h = 2_p; + positive_int kernel_w = 2_p; + positive_int stride_h = 2_p; + positive_int stride_w = 2_p; PoolOp pool_type = PoolOp::MAX; @@ -34,20 +34,20 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), /*activation=*/std::nullopt, - /*input_w=*/input_w.unwrap_nonnegative(), - /*input_h=*/input_h.unwrap_nonnegative(), - /*input_c=*/input_c.unwrap_nonnegative(), - /*input_n=*/input_n.unwrap_nonnegative(), - /*output_w=*/output_w.unwrap_nonnegative(), - /*output_h=*/output_h.unwrap_nonnegative(), - /*output_c=*/output_c.unwrap_nonnegative(), - /*output_n=*/output_n.unwrap_nonnegative(), + /*input_w=*/input_w.int_from_positive_int(), + /*input_h=*/input_h.int_from_positive_int(), + /*input_c=*/input_c.int_from_positive_int(), + /*input_n=*/input_n.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_n=*/output_n.int_from_positive_int(), /*pad_h=*/pad_h.unwrap_nonnegative(), /*pad_w=*/pad_w.unwrap_nonnegative(), - /*kernel_h=*/kernel_h.unwrap_nonnegative(), - /*kernel_w=*/kernel_w.unwrap_nonnegative(), - /*stride_h=*/stride_h.unwrap_nonnegative(), - /*stride_w=*/stride_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.int_from_positive_int(), + /*kernel_w=*/kernel_w.int_from_positive_int(), + /*stride_h=*/stride_h.int_from_positive_int(), + /*stride_w=*/stride_w.int_from_positive_int(), /*pool_type=*/pool_type); TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index b7990d84fa..16b03d34d9 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -9,7 +9,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::size_t num_replicas = 5; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n, 10_n, 10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p, 10_p, 10_p, 10_p}}, DataType::FLOAT, }; @@ -23,7 +23,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("forward_kernel") { TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_n}}, + TensorDims{FFOrdered{10_p}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index ceb0915c03..95989776c1 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int num_replicas = 10_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{3_n}}, + TensorDims{FFOrdered{3_p}}, DataType::FLOAT, }; @@ -73,14 +73,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - nonnegative_int num_replicas = 2_n; + positive_int num_replicas = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_n}}, + TensorDims{FFOrdered{5_p}}, DataType::FLOAT, }; TensorShape output_shape = 
TensorShape{ - TensorDims{FFOrdered{5_n, num_replicas}}, + TensorDims{FFOrdered{5_p, num_replicas}}, DataType::FLOAT, }; @@ -129,7 +129,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), output_grad_accessor_gpu, input_grad_accessor_gpu, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); // Run CPU Replicate Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = @@ -140,7 +140,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Kernels::Replicate::cpu_backward_kernel( output_grad_accessor_cpu, input_grad_accessor_cpu, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); CHECK_MESSAGE( accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 69f0a1f214..8c851e877e 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -14,7 +14,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index f2ddb2c67b..b9f97bc5cd 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -8,7 +8,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{1_n, 10_n, 10_n}}, + TensorDims{FFOrdered{1_p, 10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -55,7 +55,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Check Reverse Forward and Backward Kernels against CPU Kernels") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{1_n, 4_n, 3_n}}, + TensorDims{FFOrdered{1_p, 4_p, 3_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index 0d5dcb79a2..dc8cb276ab 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -21,7 +21,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; @@ -59,7 +59,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements().unwrap_nonnegative()); + output_grad_accessor.shape.num_elements().int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d8ddb8c4b9..d51d0e40f5 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -22,11 +22,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_n}}, + TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{50_n}}, + 
TensorDims{FFOrdered{50_p}}, DataType::FLOAT, }; diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index e2042c1e2c..06b5add3c7 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -21,7 +21,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_n, 10_n}}, + TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, }; TensorShape output_shape = input_shape; diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 15ebdd5f28..974e580b8e 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -56,14 +56,14 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); int batch_size = - logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); + ASSERT(logit.shape.num_elements() == label.shape.num_elements()); + scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { @@ -71,27 +71,27 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); int num_classes = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); - assert(logit_grad.shape == logit.shape); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() / + .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative(); // TODO FIXME something seems wrong here, + .int_from_positive_int(); // TODO FIXME something seems wrong here, // isn't the numerator guaranteed to be 1? 
// <--- this is not the case because of the // potential parallel dim } - assert( - label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() == + ASSERT( + label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative()); - assert( - label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == + .int_from_positive_int()); + ASSERT( + label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, @@ -100,17 +100,17 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), batch_size, num_classes, k, scale_factor); } else { - assert(logit.shape == label.shape); - assert(logit_grad.shape == logit.shape); + ASSERT(logit.shape == label.shape); + ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { profile(categorical_crossentropy_loss_backward_kernel, @@ -119,8 +119,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -131,8 +131,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -142,13 +142,13 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } default: - throw mk_runtime_error(fmt::format( + PANIC(fmt::format( "Unsupported loss function {}. 
Please report this as an issue.", loss_type)); } diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b8fc37b2d..1d65172e67 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -66,18 +66,18 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto profiling = acc.get_argument(PROFILING); - assert(weight.shape == weight_grad.shape); - int size = weight_grad.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight.shape == weight_grad.shape); + int size = weight_grad.shape.num_elements().int_from_positive_int(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative()); - int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / - weight.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight_grad.shape.num_elements().int_from_positive_int() & + weight.shape.num_elements().int_from_positive_int()); + int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() / + weight.shape.num_elements().int_from_positive_int(); float *sgd_v_ptr; if (attrs.momentum > 0.0f) { auto sgd_v = acc.get_optimizer_tensor(SGD_V); - assert(sgd_v.shape == weight.shape); + ASSERT(sgd_v.shape == weight.shape); sgd_v_ptr = sgd_v.get_float_ptr(); } @@ -180,14 +180,10 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto profiling = acc.get_argument(PROFILING); - assert(weight.shape == weight_grad.shape); - int size = weight_grad.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight.shape == weight_grad.shape); + int size = weight_grad.shape.num_elements().int_from_positive_int(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() % - weight.shape.get_volume().unwrap_nonnegative() == - 0); - int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / - weight.shape.get_volume().unwrap_nonnegative(); + ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0); auto handle = acc.get_argument(HANDLE); profile(adam_nccl_update_task_gpu, @@ -198,9 +194,9 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { attrs.beta2, attrs.weight_decay, attrs.epsilon, - size, handle, weight_grad.get_float_ptr(), + size, m_tensor.get_float_ptr(), v_tensor.get_float_ptr(), weight.get_float_ptr()); // how to deal with removal of ParamSync? 
diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 45fc8e0a1c..971b09356c 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,15 +29,15 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 6dabe09799..2494ff1943 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,8 +1,10 @@ +#include "kernels/compare_tensor_accessors.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/local_cpu_allocator.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" +#include "kernels/tensor_accessor_reductions.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -20,12 +22,9 @@ bool did_loss_decrease( GenericTensorAccessorR const &first_epoch, GenericTensorAccessorR const &last_epoch ) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all(compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } TEST_SUITE(FF_CUDA_TEST_SUITE) { @@ -43,13 +42,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = @@ -66,14 +65,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape 
weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = @@ -173,7 +172,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // the first epoch GenericTensorAccessorR first_epoch_loss = loss_values.at(0); GenericTensorAccessorR last_epoch = loss_values.back(); - CHECK(did_loss_decrease( - first_epoch_loss, last_epoch, batch_size.unwrap_nonnegative())); + CHECK(did_loss_decrease( first_epoch_loss, last_epoch)); } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 4d015f4cfa..71148d06c1 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( /*workSpaceSize=*/1024 * 1024, @@ -25,8 +25,8 @@ TEST_SUITE(FF_TEST_SUITE) { LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; SUBCASE("Estimate cost -- Attention Op") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, @@ -38,14 +38,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }); @@ -68,7 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_1d_machine_view( MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, MachineSpecificationDimension::INTRA_NODE, - stride_t{0_n})); + stride_t{1_p})); CHECK(result.total_elapsed_time > 0); CHECK(result.total_mem_usage > 0); diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc deleted file mode 100644 index e55d1eddf5..0000000000 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ /dev/null @@ -1,276 +0,0 @@ -#include "kernels/attention_kernels.h" -#include "kernels/local_cpu_allocator.h" -#include "local-execution/local_cost_estimator.h" -#include "local-execution/local_slots_backing.h" -#include "op-attrs/ops/attention.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph_builder.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalSlotsBacking -- Attention Op") { - // allocate input memory - Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; - - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; - - DataType dtype = DataType::FLOAT; - TensorShape 
input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, - DataType::FLOAT, - }; - TensorShape query_shape = input_tensor_shape; - TensorShape key_shape = input_tensor_shape; - TensorShape value_shape = input_tensor_shape; - GenericTensorAccessorW query = allocator.allocate_tensor(query_shape); - GenericTensorAccessorW key = allocator.allocate_tensor(key_shape); - GenericTensorAccessorW value = allocator.allocate_tensor(value_shape); - - // build graph - ComputationGraphBuilder cg_builder; - tensor_guid_t query_guid = - cg_builder.create_input(query_shape, CreateGrad::YES); - tensor_guid_t key_guid = - cg_builder.create_input(key_shape, CreateGrad::YES); - tensor_guid_t value_guid = - cg_builder.create_input(value_shape, CreateGrad::YES); - - std::string layer_name = "attn1"; - tensor_guid_t output_guid = - cg_builder.multihead_attention(query_guid, - key_guid, - value_guid, - embed_dim, - num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0f, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - /*initializer=*/std::nullopt, - /*maybe_name=*/layer_name); - - layer_guid_t layer_guid = - get_layer_by_name(cg_builder.computation_graph, layer_name); - - TensorBackingMap tensor_backing_map = { - {query_guid, query}, {key_guid, key}, {value_guid, value}}; - - // runtime arg config - ProfilingSettings settings = ProfilingSettings{/*warmup_iters=*/0, - /*measure_iters=*/0}; - PerDeviceFFHandle handle = get_mock_per_device_ff_handle(); - RuntimeArgConfig runtime_arg_config = - RuntimeArgConfig{DeviceSpecific::create(handle), - EnableProfiling::NO, - settings}; - - LocalSlotsBacking local_slots_backing = {tensor_backing_map, - runtime_arg_config}; - - SUBCASE("LocalSlotsBacking::allocate_outgoing_tensors") { - auto get_result_shape_and_dtype_for_tensor_guid_and_map = - [&](tensor_guid_t t, - TensorBackingMap m) -> std::pair { - GenericTensorAccessorW accessor = m.at(t); - return get_shape_and_datatype(accessor); - }; - - SUBCASE("Input (QKV) and gradient tensors allocation") { - - // allocate all tensors from input nodes - for (layer_guid_t const &node : - topological_ordering(cg_builder.computation_graph)) { - if (node == layer_guid) { - break; - } - local_slots_backing.allocate_outgoing_tensors( - node, cg_builder.computation_graph, allocator); - } - - SUBCASE("Query grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - query_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(query_shape), dtype}; - CHECK(result == correct); - } - SUBCASE("Key grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - key_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(key_shape), dtype}; - CHECK(result == correct); - } - SUBCASE("Value grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - value_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape(value_shape), dtype}; - CHECK(result == correct); - } - } - SUBCASE("Output and gradient tensors allocation") { - local_slots_backing.allocate_outgoing_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Output") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape( - 
get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape), - dtype}; - CHECK(result == correct); - } - SUBCASE("Output grad") { - std::pair result = - get_result_shape_and_dtype_for_tensor_guid_and_map( - output_guid, local_slots_backing.gradient_tensor_mapping); - std::pair correct = { - array_shape_from_tensor_shape( - get_tensor_attrs(cg_builder.computation_graph, output_guid) - .shape), - dtype}; - CHECK(result == correct); - } - } - - SUBCASE("Tensor slots") { - local_slots_backing.allocate_outgoing_tensors( - layer_guid, cg_builder.computation_graph, allocator); - SUBCASE("Input tensor slots") { - std::vector correct_incoming_tensors = - get_incoming_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_incoming_tensors == - local_slots_backing.input_tensor_slots.at(layer_guid)); - } - SUBCASE("Output tensor slots") { - std::vector correct_outgoing_tensors = - get_outgoing_tensors(cg_builder.computation_graph, layer_guid); - CHECK(correct_outgoing_tensors == - local_slots_backing.output_tensor_slots.at(layer_guid)); - } - } - } - - SUBCASE("Construct Slots Backings") { - enum Slots { - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - QUERY_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - ATTRS, - PROFILING, - HANDLE, - }; - MultiHeadAttentionAttrs attrs = - get_layer_attrs(cg_builder.computation_graph, layer_guid) - .op_attrs.get(); - OpTaskBinding binding = [&] { - OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(3)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_grad(QUERY, input_tensor(0)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(HANDLE, ff_handle()); - return b; - }(); - - // allocate all incoming and outgoing tensors for graph - for (layer_guid_t const &node : - topological_ordering(cg_builder.computation_graph)) { - local_slots_backing.allocate_outgoing_tensors( - node, cg_builder.computation_graph, allocator); - } - - SUBCASE("LocalSlotsBacking::construct_tensor_slots_backing") { - TensorSlotsBackingWithoutAddresses result = - get_slots_backing_without_tensor_allocation_addresses( - local_slots_backing.construct_tensor_slots_backing(binding, - layer_guid)); - TensorSlotsBackingWithoutAddresses correct = [&] { - TensorShape weights_shape = throw_if_unexpected( - get_weights_shape(attrs, query_shape, key_shape, value_shape)); - GenericTensorAccessorW weights = - allocator.allocate_tensor(weights_shape); - - TensorAttrs output_attrs = - get_tensor_attrs(cg_builder.computation_graph, output_guid); - GenericTensorAccessorW output = - allocator.allocate_tensor(output_attrs.shape); - return get_slots_backing_without_tensor_allocation_addresses( - TensorSlotsBacking{ - {SlotGradId{slot_id_t{QUERY}, IsGrad::NO}, query}, - {SlotGradId{slot_id_t{KEY}, IsGrad::NO}, key}, - {SlotGradId{slot_id_t{VALUE}, IsGrad::NO}, value}, - {SlotGradId{slot_id_t{WEIGHTS}, IsGrad::NO}, weights}, - {SlotGradId{slot_id_t{OUTPUT}, IsGrad::NO}, output}, - {SlotGradId{slot_id_t{QUERY}, IsGrad::YES}, query}}); - }(); - - CHECK(result == correct); - } - SUBCASE("LocalSlotsBacking::construct_arg_slots_backing") { - ArgSlotsBacking result = - local_slots_backing.construct_arg_slots_backing(binding, - layer_guid); - - ArgSlotsBacking correct = [&] { - ParallelTensorShape query_parallel_tensor_shape = - lift_to_parallel(query_shape); - - 
return ArgSlotsBacking{ - {slot_id_t{QPROJSIZE}, - ConcreteArgSpec::create(get_qProjSize(attrs))}, - {slot_id_t{ATTRS}, ConcreteArgSpec::create(attrs)}, - {slot_id_t{QUERY_PARALLEL_TENSOR_SHAPE}, - ConcreteArgSpec::create(query_parallel_tensor_shape)}, - {slot_id_t{PROFILING}, - ConcreteArgSpec::create(runtime_arg_config.profiling_settings)}, - {slot_id_t{HANDLE}, ConcreteArgSpec::create(handle)}}; - }(); - - CHECK(result == correct); - } - - SUBCASE("LocalSlotsBacking::resolve_runtime_arg_ref_spec") { - RuntimeArgRefSpec ref_spec = RuntimeArgRefSpec::create(ff_handle()); - ConcreteArgSpec arg_spec = - local_slots_backing.resolve_runtime_arg_ref_spec(ref_spec); - - PerDeviceFFHandle result_handle = arg_spec.get(); - CHECK(result_handle == handle); - } - } - } -} diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index 29b3b432cd..e817b6fd8e 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -9,17 +9,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 594051c2f1..df787fcd6f 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -1,4 +1,4 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" #include "test_utils.h" #include "utils/containers/keys.h" @@ -94,11 +94,11 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index e8f48413b6..5a9347e37b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -13,7 +13,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; @@ -31,15 +31,15 @@ TEST_SUITE(FF_TEST_SUITE) { loss_tensor_t label_for_sparse_cce_loss_attrs = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 
16_p; + positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index c87fd3a899..ea20eb0fa0 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -12,8 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -80,7 +80,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("different attrs, still same task fn mapping") { layer_guid_t layer_1 = layer_guid_t{Node{1}}; - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; layer_guid_t layer_2 = layer_guid_t{Node{2}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { SUBCASE("different attrs is still equal") { - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 82f5a132fe..7a2650b447 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,15 +38,15 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; diff --git a/lib/local-execution/test/src/test_update.cc 
b/lib/local-execution/test/src/test_update.cc index 18509d1fd9..6ffe002f22 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -11,7 +11,7 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; @@ -26,16 +26,16 @@ TEST_SUITE(FF_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = diff --git a/lib/models/include/models/bert/bert_config.struct.toml b/lib/models/include/models/bert/bert_config.struct.toml index cc2a8eb0a7..de56a25710 100644 --- a/lib/models/include/models/bert/bert_config.struct.toml +++ b/lib/models/include/models/bert/bert_config.struct.toml @@ -12,28 +12,28 @@ features = [ includes = [ "op-attrs/activation.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "vocab_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "hidden_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_encoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dim_feedforward" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "hidden_act" @@ -65,8 +65,8 @@ type = "float" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml index e7d83efd07..135c58e1cc 100644 --- a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml +++ b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml @@ -14,7 +14,7 @@ includes = [ "", "", "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -26,19 +26,19 @@ src_includes = [ [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dense_layers" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "dense_feature_layers" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "feature_shapes" -type = "std::map" +type = "std::map" [[fields]] name = "input_features" diff --git a/lib/models/include/models/dlrm/dlrm_config.struct.toml b/lib/models/include/models/dlrm/dlrm_config.struct.toml index 5f1c38faae..3cf43aed48 100644 --- 
a/lib/models/include/models/dlrm/dlrm_config.struct.toml +++ b/lib/models/include/models/dlrm/dlrm_config.struct.toml @@ -14,7 +14,7 @@ includes = [ "", "", "models/dlrm/dlrm_arch_interaction_op.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -24,23 +24,23 @@ src_includes = [ [[fields]] name = "embedding_dim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "embedding_bag_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "embedding_size" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "dense_arch_layer_sizes" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "over_arch_layer_sizes" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[fields]] name = "arch_interaction_op" @@ -48,7 +48,7 @@ type = "::FlexFlow::DLRMArchInteractionOp" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "seed" diff --git a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml index 1290420e16..0075783c87 100644 --- a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml +++ b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml @@ -11,16 +11,16 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_classes" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "aux_logits" diff --git a/lib/models/include/models/split_test/split_test.h b/lib/models/include/models/split_test/split_test.h index dd7089c4f6..d5de538b8b 100644 --- a/lib/models/include/models/split_test/split_test.h +++ b/lib/models/include/models/split_test/split_test.h @@ -12,7 +12,7 @@ namespace FlexFlow { * @note This is a tiny model developed for testing the original Unity * implementation. It is not a "real" model and has never been trained. 
*/ -ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size); +ComputationGraph get_split_test_computation_graph(positive_int batch_size); } // namespace FlexFlow diff --git a/lib/models/include/models/transformer/transformer_config.struct.toml b/lib/models/include/models/transformer/transformer_config.struct.toml index 2a0b39feb9..686491eff4 100644 --- a/lib/models/include/models/transformer/transformer_config.struct.toml +++ b/lib/models/include/models/transformer/transformer_config.struct.toml @@ -10,36 +10,36 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_features" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dim_feedforward" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_encoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_decoder_layers" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dropout" @@ -51,4 +51,4 @@ type = "float" [[fields]] name = "vocab_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index 535e03e413..bfcab8ffbf 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -7,11 +7,11 @@ namespace FlexFlow { BertConfig get_default_bert_config() { return BertConfig{ - /*vocab_size=*/30522_n, - /*hidden_size=*/768_n, - /*num_encoder_layers=*/12_n, - /*num_heads=*/12_n, - /*dim_feedforward=*/3072_n, + /*vocab_size=*/30522_p, + /*hidden_size=*/768_p, + /*num_encoder_layers=*/12_p, + /*num_heads=*/12_p, + /*dim_feedforward=*/3072_p, /*hidden_act=*/Activation::GELU, /*hidden_dropout_prob=*/0.1, /*attention_probs_dropout_prob=*/0.1, @@ -19,8 +19,8 @@ BertConfig get_default_bert_config() { /*layer_norm_eps=*/1e-12, /*position_embedding_type=*/"absolute", /*classifier_dropout=*/0.1, - /*sequence_length=*/512_n, - /*batch_size=*/64_n, + /*sequence_length=*/512_p, + /*batch_size=*/64_p, }; } @@ -60,8 +60,8 @@ tensor_guid_t assert(num_dims(cgb.get_shape(input)) == 3); std::vector layer_norm_axis = { relative_ff_dim_t{-1}}; // Apply layernorm across the last dim - nonnegative_int kdim = config.dim_feedforward / config.num_heads; - nonnegative_int vdim = config.dim_feedforward / config.num_heads; + positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; + positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; tensor_guid_t self_attention = cgb.multihead_attention(input, input, @@ -130,7 +130,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { InitializerAttrs bias_initializer = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.hidden_size}}, DataType::FLOAT, }; @@ -152,7 +152,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { assert( (cgb.get_shape(out_prob) == TensorShape{ - TensorDims{FFOrdered{ + 
TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.vocab_size}}, DataType::FLOAT, })); diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 3d06b03348..8bbbccdbaf 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -6,16 +6,16 @@ namespace FlexFlow { CandleUnoConfig get_default_candle_uno_config() { return CandleUnoConfig{ - /*batch_size=*/64_n, - /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_n), + /*batch_size=*/64_p, + /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_p), /*dense_feature_layers=*/ - repeat_element(/*num_times=*/8_n, /*element=*/4192_n), + repeat_element(/*num_times=*/8_n, /*element=*/4192_p), /*feature_shapes=*/ { - {"dose", 1_n}, - {"cell.rnaseq", 942_n}, - {"drug.descriptors", 5270_n}, - {"drug.fingerprints", 2048_n}, + {"dose", 1_p}, + {"cell.rnaseq", 942_p}, + {"drug.descriptors", 5270_p}, + {"drug.fingerprints", 2048_p}, }, /*input_features=*/ { @@ -37,7 +37,7 @@ tensor_guid_t create_candle_uno_feature_model( tensor_guid_t const &input, InitializerAttrs const &kernel_initializer) { tensor_guid_t t = input; - for (nonnegative_int dense_dim : config.dense_feature_layers) { + for (positive_int dense_dim : config.dense_feature_layers) { t = cgb.dense(t, dense_dim, Activation::RELU, @@ -58,7 +58,7 @@ ComputationGraph InitializerAttrs{GlorotNormalAttrs{/*seed=*/0}}; auto create_input_tensor = - [&](FFOrdered const &dims) -> tensor_guid_t { + [&](FFOrdered const &dims) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, DataType::FLOAT, @@ -84,7 +84,7 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; - nonnegative_int shape = config.feature_shapes.at(feature_name); + positive_int shape = config.feature_shapes.at(feature_name); tensor_guid_t input = create_input_tensor({config.batch_size, shape}); all_inputs.push_back(input); @@ -98,7 +98,7 @@ ComputationGraph tensor_guid_t output = cgb.concat(encoded_inputs, /*axis=*/relative_ff_dim_t{1}); - for (nonnegative_int dense_layer_dim : config.dense_layers) { + for (positive_int dense_layer_dim : config.dense_layers) { tensor_guid_t residual_input = output; output = cgb.dense(output, dense_layer_dim, @@ -114,7 +114,7 @@ ComputationGraph } } output = cgb.dense(output, - /*outDim=*/1_n, + /*outDim=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index 718e709352..5d56909fec 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -10,37 +10,37 @@ namespace FlexFlow { DLRMConfig get_default_dlrm_config() { return DLRMConfig{ - /*embedding_dim=*/64_n, - /*embedding_bag_size=*/1_n, + /*embedding_dim=*/64_p, + /*embedding_bag_size=*/1_p, /*embedding_size=*/ - std::vector{ - 1000000_n, - 1000000_n, - 1000000_n, - 1000000_n, + std::vector{ + 1000000_p, + 1000000_p, + 1000000_p, + 1000000_p, }, /*dense_arch_layer_sizes=*/ - std::vector{ - 4_n, - 64_n, - 64_n, + std::vector{ + 4_p, + 64_p, + 64_p, }, /*over_arch_layer_sizes=*/ - std::vector{ - 64_n, - 64_n, - 2_n, + std::vector{ + 64_p, + 64_p, + 2_p, }, /*arch_interaction_op=*/DLRMArchInteractionOp::CAT, - /*batch_size=*/64_n, - /*seed=*/std::rand(), + /*batch_size=*/64_p, + /*seed=*/0, }; } tensor_guid_t create_dlrm_mlp(ComputationGraphBuilder 
&cgb, DLRMConfig const &config, tensor_guid_t const &input, - std::vector const &mlp_layers) { + std::vector const &mlp_layers) { tensor_guid_t t = input; // Refer to @@ -76,8 +76,8 @@ tensor_guid_t create_dlrm_mlp(ComputationGraphBuilder &cgb, tensor_guid_t create_dlrm_sparse_embedding_network(ComputationGraphBuilder &cgb, DLRMConfig const &config, tensor_guid_t const &input, - nonnegative_int input_dim, - nonnegative_int output_dim) { + positive_int input_dim, + positive_int output_dim) { float range = sqrt(1.0f / input_dim); InitializerAttrs embed_initializer = InitializerAttrs{UniformInitializerAttrs{ /*seed=*/config.seed, @@ -116,7 +116,7 @@ tensor_guid_t create_dlrm_interact_features( ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { ComputationGraphBuilder cgb; - auto create_input_tensor = [&](FFOrdered const &dims, + auto create_input_tensor = [&](FFOrdered const &dims, DataType const &data_type) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, @@ -145,7 +145,7 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { std::vector emb_outputs = transform( zip(config.embedding_size, sparse_inputs), - [&](std::pair const &combined_pair) + [&](std::pair const &combined_pair) -> tensor_guid_t { return create_dlrm_sparse_embedding_network( /*cgb=*/cgb, diff --git a/lib/models/src/models/inception_v3/inception_v3.cc b/lib/models/src/models/inception_v3/inception_v3.cc index 3a829f3754..82aa445f17 100644 --- a/lib/models/src/models/inception_v3/inception_v3.cc +++ b/lib/models/src/models/inception_v3/inception_v3.cc @@ -16,12 +16,12 @@ struct CheckShape { InceptionV3Config const &config; void operator()(tensor_guid_t t, - nonnegative_int c, - nonnegative_int h, - nonnegative_int w) const { + positive_int c, + positive_int h, + positive_int w) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, c, h, @@ -38,10 +38,10 @@ struct CheckShape { } } - void operator()(tensor_guid_t t, nonnegative_int c) const { + void operator()(tensor_guid_t t, positive_int c) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, c, }}, @@ -59,11 +59,11 @@ struct CheckShape { InceptionV3Config get_default_inception_v3_training_config() { return InceptionV3Config{ - /*num_classes=*/1000_n, + /*num_classes=*/1000_p, // see section 8 of https://arxiv.org/abs/1512.00567 for the source of the // batch size - /*batch_size=*/32_n, + /*batch_size=*/32_p, // see section 4 of https://arxiv.org/abs/1512.00567 for a discussion of // auxiliary logits. 
they are used by default in training @@ -73,11 +73,11 @@ InceptionV3Config get_default_inception_v3_training_config() { static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - nonnegative_int filters, - nonnegative_int kernel_size_h, - nonnegative_int kernel_size_w, - nonnegative_int stride_h = 1_n, - nonnegative_int stride_w = 1_n, + positive_int filters, + positive_int kernel_size_h, + positive_int kernel_size_w, + positive_int stride_h = 1_p, + positive_int stride_w = 1_p, nonnegative_int padding_h = 0_n, nonnegative_int padding_w = 0_n, bool use_bias = false) { @@ -90,7 +90,7 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, /*paddingH=*/padding_h, /*paddingW=*/padding_w, /*activation=*/std::nullopt, - /*groups=*/1_n, + /*groups=*/1_p, /*use_bias=*/use_bias); return cgb.batch_norm(conv, /*affine=*/true, @@ -101,27 +101,27 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - nonnegative_int pool_features) { + positive_int pool_features) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t branch5x5 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/48_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/48_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/5_n, - /*kernel_size_w=*/5_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/64_p, + /*kernel_size_h=*/5_p, + /*kernel_size_w=*/5_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/2_n, /*padding_w=*/2_n); return t; @@ -131,25 +131,25 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); return t; @@ -158,18 +158,18 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, /*filters=*/pool_features, - /*kernel_stride_h=*/1_n, - /*kernel_stride_w=*/1_n); + /*kernel_stride_h=*/1_p, + /*kernel_stride_w=*/1_p); return t; }(); @@ -181,43 +181,43 @@ static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch3x3 = create_conv_block(cgb, input, - /*filters=*/384_n, - 
/*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); tensor_guid_t branch3x3dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/64_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/96_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96_n, - /*kernel_stride_h=*/3_n, - /*kernel_stride_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/96_p, + /*kernel_stride_h=*/3_p, + /*kernel_stride_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); @@ -229,108 +229,108 @@ static tensor_guid_t create_inception_module_b(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_c(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int channels_7x7) { + positive_int channels_7x7) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(branch1x1, 192_n, 17_n, 17_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(branch1x1, 192_p, 17_p, 17_p); tensor_guid_t branch7x7 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); return t; }(); - check_shape(branch7x7, 192_n, 17_n, 17_n); + check_shape(branch7x7, 192_p, 17_p, 17_p); tensor_guid_t branch7x7dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, 
/*filters=*/channels_7x7, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); return t; }(); - check_shape(branch7x7dbl, 192_n, 17_n, 17_n); + check_shape(branch7x7dbl, 192_p, 17_p, 17_p); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); return t; }(); - check_shape(branch_pool, 192_n, 17_n, 17_n); + check_shape(branch_pool, 192_p, 17_p, 17_p); return cgb.concat({branch1x1, branch7x7, branch7x7dbl, branch_pool}, /*axis=*/relative_ff_dim_t{1}); @@ -342,10 +342,10 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - t = create_conv_block(cgb, t, 320_n, 3_n, 3_n, 2_n, 2_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + t = create_conv_block(cgb, t, 320_p, 3_p, 3_p, 2_p, 2_p); return t; }(); @@ -353,42 +353,42 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/7_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/7_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/7_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/192_p, + /*kernel_size_h=*/7_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/3_n, /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); + /*filters=*/192_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); @@ -401,33 +401,33 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/320_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/320_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t branch3x3 = [&] { tensor_guid_t t = input; t = 
create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/0_n); t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); @@ -438,34 +438,34 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/448_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/448_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); t = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/384_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/0_n); t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); @@ -475,18 +475,18 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/1_n, /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); + /*filters=*/192_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); return t; }(); @@ -499,75 +499,75 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t t = input; - check_shape(t, 3_n, 299_n, 299_n); + check_shape(t, 3_p, 299_p, 299_p); // Conv2d_1a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n); - check_shape(t, 32_n, 149_n, 149_n); + /*filters=*/32_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p); + check_shape(t, 32_p, 149_p, 149_p); // Conv2d_2a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n); - check_shape(t, 32_n, 147_n, 147_n); + /*filters=*/32_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p); + check_shape(t, 32_p, 147_p, 147_p); // Conv2d_2b_3x3 t = 
create_conv_block(cgb, t, - /*filters=*/64_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*filters=*/64_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/1_n, /*padding_w=*/1_n); - check_shape(t, 64_n, 147_n, 147_n); + check_shape(t, 64_p, 147_p, 147_p); // maxpool1 t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 64_n, 73_n, 73_n); + check_shape(t, 64_p, 73_p, 73_p); // Conv2d_3b_1x1 t = create_conv_block(cgb, t, - /*filters=*/80_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(t, 80_n, 73_n, 73_n); + /*filters=*/80_p, + /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(t, 80_p, 73_p, 73_p); // Conv2d_4a_3x3 t = create_conv_block(cgb, t, - /*filters=*/192_n, - /*kernel_size_h=*/3_n, - /*kernel_size_w=*/3_n); - check_shape(t, 192_n, 71_n, 71_n); + /*filters=*/192_p, + /*kernel_size_h=*/3_p, + /*kernel_size_w=*/3_p); + check_shape(t, 192_p, 71_p, 71_p); // maxpool2 t = cgb.pool2d(t, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/2_n, - /*strideW=*/2_n, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/2_p, + /*strideW=*/2_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 192_n, 35_n, 35_n); + check_shape(t, 192_p, 35_p, 35_p); return t; } @@ -575,26 +575,26 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int num_classes) { + positive_int num_classes) { // avgpool tensor_guid_t x = cgb.pool2d(input, - /*kernelH=*/8_n, - /*kernelW=*/8_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*kernelH=*/8_p, + /*kernelW=*/8_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 2048_n, 1_n, 1_n); + check_shape(x, 2048_p, 1_p, 1_p); // dropout x = cgb.dropout(x, /*rate=*/0.5); - check_shape(x, 2048_n, 1_n, 1_n); + check_shape(x, 2048_p, 1_p, 1_p); x = cgb.flat(x, /*start_dim=*/relative_ff_dim_t{1}); - check_shape(x, 2048_n); + check_shape(x, 2048_p); // fc x = cgb.dense(x, @@ -602,7 +602,7 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, check_shape(x, num_classes); // softmax (not in pytorch model, but shown in Table 1 on p6 of - // https://arxiv.org/abs/1512.00567_n) + // https://arxiv.org/abs/1512.00567) x = cgb.softmax(x); check_shape(x, num_classes); @@ -612,44 +612,44 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_aux(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - nonnegative_int num_classes) { + positive_int num_classes) { tensor_guid_t x = input; - check_shape(x, 768_n, 17_n, 17_n); + check_shape(x, 768_p, 17_p, 17_p); x = cgb.pool2d(x, - /*kernelH=*/5_n, - /*kernelW=*/5_n, - /*strideH=*/3_n, - /*strideW=*/3_n, + /*kernelH=*/5_p, + /*kernelW=*/5_p, + /*strideH=*/3_p, + /*strideW=*/3_p, /*paddingH=*/0_n, /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 768_n, 5_n, 5_n); + check_shape(x, 768_p, 5_p, 5_p); // conv0 x = create_conv_block(cgb, x, - /*filters=*/128_n, - /*kernel_size_h=*/1_n, - /*kernel_size_w=*/1_n); - check_shape(x, 128_n, 5_n, 5_n); + /*filters=*/128_p, 
+ /*kernel_size_h=*/1_p, + /*kernel_size_w=*/1_p); + check_shape(x, 128_p, 5_p, 5_p); // conv1 x = create_conv_block(cgb, x, - /*filters=*/768_n, - /*kernel_size_h=*/5_n, - /*kernel_size_w=*/5_n); - check_shape(x, 768_n, 1_n, 1_n); + /*filters=*/768_p, + /*kernel_size_h=*/5_p, + /*kernel_size_w=*/5_p); + check_shape(x, 768_p, 1_p, 1_p); x = cgb.adaptive_pool2d(x, - /*output_h=*/1_n, - /*output_w=*/1_n); - check_shape(x, 768_n, 1_n, 1_n); + /*output_h=*/1_p, + /*output_w=*/1_p); + check_shape(x, 768_p, 1_p, 1_p); x = cgb.flat(x, /*start_dim=*/relative_ff_dim_t{1}); - check_shape(x, 768_n); + check_shape(x, 768_p); // fc x = cgb.dense(x, @@ -671,39 +671,39 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, }; tensor_guid_t x = create_initial_layers(cgb, check_shape, input); - check_shape(x, 192_n, 35_n, 35_n); + check_shape(x, 192_p, 35_p, 35_p); // Mixed_5b - x = create_inception_module_a(cgb, x, 32_n); - check_shape(x, 256_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 32_p); + check_shape(x, 256_p, 35_p, 35_p); // Mixed_5c - x = create_inception_module_a(cgb, x, 64_n); - check_shape(x, 288_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 64_p); + check_shape(x, 288_p, 35_p, 35_p); // Mixed_5d - x = create_inception_module_a(cgb, x, 64_n); - check_shape(x, 288_n, 35_n, 35_n); + x = create_inception_module_a(cgb, x, 64_p); + check_shape(x, 288_p, 35_p, 35_p); // Mixed_6a x = create_inception_module_b(cgb, x); - check_shape(x, 768_n, 17_n, 17_n); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6b - x = create_inception_module_c(cgb, check_shape, x, 128_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 128_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6c - x = create_inception_module_c(cgb, check_shape, x, 160_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 160_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6d - x = create_inception_module_c(cgb, check_shape, x, 160_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 160_p); + check_shape(x, 768_p, 17_p, 17_p); // Mixed_6e - x = create_inception_module_c(cgb, check_shape, x, 192_n); - check_shape(x, 768_n, 17_n, 17_n); + x = create_inception_module_c(cgb, check_shape, x, 192_p); + check_shape(x, 768_p, 17_p, 17_p); std::optional aux; if (config.aux_logits) { @@ -713,15 +713,15 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, // Mixed_7a x = create_inception_module_d(cgb, x); - check_shape(x, 1280_n, 8_n, 8_n); + check_shape(x, 1280_p, 8_p, 8_p); // Mixed_7b x = create_inception_module_e(cgb, x); - check_shape(x, 2048_n, 8_n, 8_n); + check_shape(x, 2048_p, 8_p, 8_p); // Mixed_7c x = create_inception_module_e(cgb, x); - check_shape(x, 2048_n, 8_n, 8_n); + check_shape(x, 2048_p, 8_p, 8_p); x = create_final_layers(cgb, check_shape, x, config.num_classes); check_shape(x, config.num_classes); @@ -737,11 +737,11 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, - 3_n, - 299_n, - 299_n, + 3_p, + 299_p, + 299_p, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index d3876d8bfc..67d2f74ce0 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -4,16 +4,16 @@ namespace FlexFlow { -ComputationGraph 
get_split_test_computation_graph(nonnegative_int batch_size) {
+ComputationGraph get_split_test_computation_graph(positive_int batch_size) {
   ComputationGraphBuilder cgb;
-  nonnegative_int layer_dim1 = 256_n;
-  nonnegative_int layer_dim2 = 128_n;
-  nonnegative_int layer_dim3 = 64_n;
-  nonnegative_int layer_dim4 = 32_n;
+  positive_int layer_dim1 = 256_p;
+  positive_int layer_dim2 = 128_p;
+  positive_int layer_dim3 = 64_p;
+  positive_int layer_dim4 = 32_p;
   TensorShape input_shape = TensorShape{
-      TensorDims{FFOrdered<nonnegative_int>{
+      TensorDims{FFOrdered<positive_int>{
           batch_size,
           layer_dim1,
       }},
diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc
index f71763313a..dfc40a5720 100644
--- a/lib/models/src/models/transformer/transformer.cc
+++ b/lib/models/src/models/transformer/transformer.cc
@@ -4,16 +4,16 @@
 namespace FlexFlow {
 TransformerConfig get_default_transformer_config() {
-  return TransformerConfig{/*num_features=*/512_n,
-                           /*sequence_length=*/512_n,
-                           /*batch_size=*/64_n,
-                           /*dim_feedforward=*/2048_n,
-                           /*num_heads=*/8_n,
-                           /*num_encoder_layers=*/6_n,
-                           /*num_decoder_layers=*/6_n,
+  return TransformerConfig{/*num_features=*/512_p,
+                           /*sequence_length=*/512_p,
+                           /*batch_size=*/64_p,
+                           /*dim_feedforward=*/2048_p,
+                           /*num_heads=*/8_p,
+                           /*num_encoder_layers=*/6_p,
+                           /*num_decoder_layers=*/6_p,
                            /*dropout=*/0.1,
                            /*layer_norm_eps=*/1e-05,
-                           /*vocab_size=*/64_n};
+                           /*vocab_size=*/64_p};
 }
 tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb,
@@ -34,8 +34,8 @@ tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb,
                                                tensor_guid_t const &input) {
   std::vector<relative_ff_dim_t> layer_norm_axis = {
       relative_ff_dim_t{-1}}; // Normalize the last dim
-  nonnegative_int kdim = config.dim_feedforward / config.num_heads;
-  nonnegative_int vdim = config.dim_feedforward / config.num_heads;
+  positive_int kdim = positive_int{config.dim_feedforward / config.num_heads};
+  positive_int vdim = positive_int{config.dim_feedforward / config.num_heads};
   tensor_guid_t self_attention =
       cgb.multihead_attention(/*query=*/input,
                               /*key=*/input,
@@ -83,8 +83,8 @@ tensor_guid_t
                                tensor_guid_t const &encoder_output) {
   std::vector<relative_ff_dim_t> layer_norm_axis = {
       relative_ff_dim_t{-1}}; // Normalize the last dim
-  nonnegative_int kdim = config.dim_feedforward / config.num_heads;
-  nonnegative_int vdim = config.dim_feedforward / config.num_heads;
+  positive_int kdim = positive_int{config.dim_feedforward / config.num_heads};
+  positive_int vdim = positive_int{config.dim_feedforward / config.num_heads};
   tensor_guid_t self_attention =
       cgb.multihead_attention(/*query=*/input,
                               /*key=*/input,
@@ -153,7 +153,7 @@ ComputationGraph
   ComputationGraphBuilder cgb;
   TensorShape input_shape = TensorShape{
-      TensorDims{FFOrdered<nonnegative_int>{
+      TensorDims{FFOrdered<positive_int>{
          config.batch_size, config.sequence_length, config.num_features}},
       DataType::FLOAT,
   };
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index 9996e36482..62f7ccd4f9 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -4,7 +4,7 @@
 #include "op-attrs/datatype.dtg.h"
 #include "utils/fmt.h"
 #include "utils/fp16.h"
-#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/positive_int/positive_int.h"
 #include 
 namespace FlexFlow {
@@ -40,7 +40,7 @@
 template <typename T>
 struct type_to_data_type_enum;
 template <>
-struct type_to_data_type_enum<nonnegative_int>
+struct type_to_data_type_enum<positive_int>
     : std::integral_constant {};
 template <>
@@ -74,7 +74,7 @@ typename data_type_enum_to_class<DT>::type cast_to(T t) {
 template <DataType DT>
 using real_type_t = typename data_type_enum_to_class<DT>
::type; -nonnegative_int size_of_datatype(DataType); +positive_int size_of_datatype(DataType); /** * @brief Maximally semantics-preserving casts, not including identity diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h index 723e69bddd..b646692de9 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.h +++ b/lib/op-attrs/include/op-attrs/datatype_value.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DATATYPE_VALUE_H +#include "op-attrs/datatype.dtg.h" #include "op-attrs/datatype_value.dtg.h" namespace FlexFlow { @@ -11,6 +12,8 @@ DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); +DataType get_data_type_of_data_type_value(DataTypeValue); + } // namespace FlexFlow #endif // _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_MAKE_DATATYPE_VALUE_H diff --git a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h index bd95ff677c..f619f94e20 100644 --- a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h +++ b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h @@ -3,7 +3,7 @@ #include "op-attrs/initializers/kaiming_initializer_mode.dtg.h" #include "op-attrs/tensor_dims.dtg.h" -#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { @@ -13,7 +13,7 @@ namespace FlexFlow { * see * https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 */ -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, +positive_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index fa57a717e2..5ca237561f 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -13,31 +13,31 @@ namespace FlexFlow { -nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &); +positive_int get_qProjSize(MultiHeadAttentionAttrs const &); +positive_int get_vProjSize(MultiHeadAttentionAttrs const &); +positive_int get_kProjSize(MultiHeadAttentionAttrs const &); +positive_int get_oProjSize(MultiHeadAttentionAttrs const &); -nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_qSize(MultiHeadAttentionInputs const &); +positive_int get_qSize(MultiHeadAttentionParallelInputs const &); +positive_int get_qSize(MultiHeadAttentionInputs const &); -nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_kSize(MultiHeadAttentionInputs const &); +positive_int get_kSize(MultiHeadAttentionParallelInputs const &); +positive_int get_kSize(MultiHeadAttentionInputs const &); -nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_vSize(MultiHeadAttentionInputs const &); +positive_int get_vSize(MultiHeadAttentionParallelInputs const &); +positive_int get_vSize(MultiHeadAttentionInputs const &); -nonnegative_int get_oSize(ParallelTensorShape const &); 
-nonnegative_int get_oSize(TensorShape const &); +positive_int get_oSize(ParallelTensorShape const &); +positive_int get_oSize(TensorShape const &); -nonnegative_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &); +positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); +positive_int get_qoSeqLength(MultiHeadAttentionInputs const &); -nonnegative_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &); +positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); +positive_int get_kvSeqLength(MultiHeadAttentionInputs const &); -nonnegative_int get_num_samples(MultiHeadAttentionParallelInputs const &); -nonnegative_int get_num_samples(MultiHeadAttentionInputs const &); +positive_int get_num_samples(MultiHeadAttentionParallelInputs const &); +positive_int get_num_samples(MultiHeadAttentionInputs const &); std::vector get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &); diff --git a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml index f85b7268af..8b9aefb67e 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml @@ -11,28 +11,28 @@ features = [ includes = [ "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "batch_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sequence_length" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "query_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "key_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "value_size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml index 019131b07c..b9c6847cd6 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml @@ -11,24 +11,24 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "embed_dim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_heads" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "kdim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "vdim" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "dropout" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index b3c574264c..d80f853b00 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ 
-21,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "combine_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml index c4fb74ebd8..b81acbfadd 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml @@ -12,24 +12,24 @@ features = [ includes = [ "", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_samples" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "height" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "width" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml index fdf0eaca78..668c61168b 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -33,11 +33,11 @@ type = "::FlexFlow::ShardParallelDim" [[fields]] name = "sum_reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "discard_copy_reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml index 8b86d42e04..469ce6570e 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml @@ -13,6 +13,7 @@ includes = [ "", "op-attrs/activation.dtg.h", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -22,14 +23,14 @@ src_includes = [ ] fields = [ - { name = "out_channels", type = "::FlexFlow::nonnegative_int" }, - { name = "kernel_h", type = "::FlexFlow::nonnegative_int" }, - { name = "kernel_w", type = "::FlexFlow::nonnegative_int" }, - { name = "stride_h", type = "::FlexFlow::nonnegative_int" }, - { name = "stride_w", type = "::FlexFlow::nonnegative_int" }, + { name = "out_channels", type = "::FlexFlow::positive_int" }, + { name = "kernel_h", type = "::FlexFlow::positive_int" }, + { name = "kernel_w", type = "::FlexFlow::positive_int" }, + { name = "stride_h", type = "::FlexFlow::positive_int" }, + { name = "stride_w", type = "::FlexFlow::positive_int" }, { name = "padding_h", type = "::FlexFlow::nonnegative_int" }, { name = "padding_w", type = "::FlexFlow::nonnegative_int" }, - { name = "groups", type = "::FlexFlow::nonnegative_int" }, + { name = "groups", type = "::FlexFlow::positive_int" }, { name = "activation", type = "std::optional<::FlexFlow::Activation>" }, { name = "use_bias", type = "bool" }, ] diff --git 
a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index 5a857efb3e..07f82883db 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", "", ] @@ -24,11 +24,11 @@ src_includes = [ [[fields]] name = "num_entries" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "out_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "aggr" diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml index ffbe93c975..23513482d3 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml @@ -14,7 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "op-attrs/regularizer_attrs.dtg.h", "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -25,7 +25,7 @@ src_includes = [ [[fields]] name = "out_channels" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "use_bias" diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h index af11d61f07..368250c957 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h @@ -13,8 +13,8 @@ CHECK_VALID_OP_ATTR(Pool2DAttrs); tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp pool_type, std::optional const &activation); diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml index fea318d46d..d0005eee19 100644 --- a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml @@ -14,6 +14,7 @@ includes = [ "op-attrs/activation.dtg.h", "", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -24,19 +25,19 @@ src_includes = [ [[fields]] name = "kernel_h" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "kernel_w" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "stride_h" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "stride_w" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "padding_h" diff --git a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml index 2798a85caf..1ae2dcdc75 100644 --- a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "reduction_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml 
b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml index 965c40c05a..9f08a13fcf 100644 --- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml @@ -12,7 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -21,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "repartition_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml index 58e365c0f2..739f0edfb4 100644 --- a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "replicate_degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml index 1c5bfc8e10..8feaff4dc0 100644 --- a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml @@ -10,12 +10,12 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "k" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "sorted" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml index d68ef02ec1..e25627f709 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml @@ -13,7 +13,7 @@ includes = [ "op-attrs/parallel_tensor_shape/sum_degree.dtg.h", "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h", "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] @@ -26,4 +26,4 @@ type = "::FlexFlow::DiscardCopyDegree" [[fields]] name = "shard_degrees" -type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" +type = "::FlexFlow::FFOrdered<::FlexFlow::positive_int>" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index 67864e637b..bb374d98ee 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -9,7 +9,7 @@ namespace FlexFlow { FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &); -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &); std::unordered_set replica_dims(ParallelTensorDims const &); /* size_t get_volume(ParallelTensorDims const &); */ @@ -22,14 +22,14 @@ ParallelTensorDims lift_to_parallel_with_degrees( TensorDims const &, SumDegree const &, DiscardCopyDegree const &, - FFOrdered const &shard_degrees); + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); -nonnegative_int total_replica_degree(ParallelTensorDims const &); -nonnegative_int 
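Degrees like repartition_degree, replicate_degree, and the shard degrees above compose multiplicatively into a total parallel degree, which is why a positive type fits: a single zero degree would collapse the product and describe an impossible placement. A rough sketch of the relationship, with plain ints standing in for positive_int (total_parallel_degree_sketch is a made-up name):

    #include <cassert>
    #include <vector>

    int total_parallel_degree_sketch(int sum_degree,
                                     int discard_copy_degree,
                                     std::vector<int> const &shard_degrees) {
      assert(sum_degree >= 1 && discard_copy_degree >= 1);
      int total = sum_degree * discard_copy_degree;
      for (int d : shard_degrees) {
        assert(d >= 1); // a degree of 0 would zero out the whole product
        total *= d;
      }
      return total;
    }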
total_shard_degree(ParallelTensorDims const &); -nonnegative_int total_parallel_degree(ParallelTensorDims const &); +positive_int total_replica_degree(ParallelTensorDims const &); +positive_int total_shard_degree(ParallelTensorDims const &); +positive_int total_parallel_degree(ParallelTensorDims const &); ShardParallelDim shard_dim_at_idx(ParallelTensorDims const &, relative_ff_dim_t); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index d461ffc9e4..96d9bfb06a 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -17,7 +17,7 @@ ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional @@ -30,7 +30,7 @@ ParallelTensorShape lift_to_parallel_with_degrees( TensorShape const &, SumDegree const &, DiscardCopyDegree const &, - FFOrdered const &shard_degrees); + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); @@ -38,13 +38,13 @@ ParallelTensorShape std::unordered_set replica_dims(ParallelTensorShape const &); TensorShape get_piece_shape(ParallelTensorShape const &); -nonnegative_int get_num_replica_dims(ParallelTensorShape const &); -nonnegative_int get_num_replicas(ParallelTensorShape const &); +positive_int get_num_replica_dims(ParallelTensorShape const &); +positive_int get_num_replicas(ParallelTensorShape const &); -nonnegative_int get_sum_degree(ParallelTensorShape const &); -nonnegative_int get_discard_copy_degree(ParallelTensorShape const &); +positive_int get_sum_degree(ParallelTensorShape const &); +positive_int get_discard_copy_degree(ParallelTensorShape const &); -nonnegative_int get_total_parallel_degree(ParallelTensorShape const &); +positive_int get_total_parallel_degree(ParallelTensorShape const &); bool is_valid(ParallelTensorShape const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml index 76b52bcdef..d60495bc3a 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "value" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml index 550a384ba9..f16586c4c9 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "value" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml index 5ca486181e..ac4c2563dc 100644 --- 
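SumDegree and DiscardCopyDegree wrap their positive_int value in distinct structs rather than passing bare integers, so the two replica degrees cannot be swapped at a call site. A rough sketch of that design choice (the *Sketch names are hypothetical):

    struct SumDegreeSketch {
      int value;
    };
    struct DiscardCopyDegreeSketch {
      int value;
    };

    int total_replica_degree_sketch(SumDegreeSketch sum,
                                    DiscardCopyDegreeSketch discard_copy) {
      return sum.value * discard_copy.value;
    }
    // Passing the arguments in the wrong order fails to compile, which a
    // bare pair of ints would silently accept.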
a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml @@ -11,12 +11,12 @@ features = [ includes = [ "op-attrs/replica_type.dtg.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "replica_type" diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 92d2b0abb2..85cea57523 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -8,11 +8,10 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); -nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &, +positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &, ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); -bool is_valid(ReplicaParallelDimSet const &); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml index 5c5d2dc5b2..a11897070f 100644 --- a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml +++ b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml @@ -10,13 +10,13 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "size" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "degree" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index ba35295e09..a21602e28c 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h @@ -6,12 +6,12 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &); +FFOrdered const &ff_ordered(TensorDims const &); nonnegative_int num_dims(TensorDims const &); -nonnegative_int dim_at_idx(TensorDims const &, relative_ff_dim_t); -nonnegative_int &dim_at_idx(TensorDims &, relative_ff_dim_t); -nonnegative_int get_num_elements(TensorDims const &); +positive_int dim_at_idx(TensorDims const &, relative_ff_dim_t); +positive_int &dim_at_idx(TensorDims &, relative_ff_dim_t); +positive_int get_num_elements(TensorDims const &); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index 8c6d1098cc..a1039798c9 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -11,9 +11,9 @@ features = [ includes = [ "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "ff_ordered" -type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" +type = "::FlexFlow::FFOrdered<::FlexFlow::positive_int>" diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 298ea04638..3cafdda4b8 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -6,10 +6,10 @@ namespace FlexFlow { nonnegative_int num_dims(TensorShape const &); 
-nonnegative_int dim_at_idx(TensorShape const &, relative_ff_dim_t);
-nonnegative_int &dim_at_idx(TensorShape &, relative_ff_dim_t);
-nonnegative_int get_num_elements(TensorShape const &);
-nonnegative_int get_size_in_bytes(TensorShape const &);
+positive_int dim_at_idx(TensorShape const &, relative_ff_dim_t);
+positive_int &dim_at_idx(TensorShape &, relative_ff_dim_t);
+positive_int get_num_elements(TensorShape const &);
+positive_int get_size_in_bytes(TensorShape const &);
 
 TensorShape slice_tensor_shape(TensorShape const &,
                                relative_ff_dim_t const &start,
diff --git a/lib/op-attrs/src/op-attrs/datatype.cc b/lib/op-attrs/src/op-attrs/datatype.cc
index f8791521ab..d9e4a65f13 100644
--- a/lib/op-attrs/src/op-attrs/datatype.cc
+++ b/lib/op-attrs/src/op-attrs/datatype.cc
@@ -5,20 +5,20 @@
 
 namespace FlexFlow {
 
-nonnegative_int size_of_datatype(DataType data_type) {
+positive_int size_of_datatype(DataType data_type) {
   switch (data_type) {
     case DataType::BOOL:
-      return nonnegative_int{sizeof(bool)};
+      return positive_int{sizeof(bool)};
     case DataType::INT32:
-      return nonnegative_int{sizeof(int32_t)};
+      return positive_int{sizeof(int32_t)};
     case DataType::INT64:
-      return nonnegative_int{sizeof(int64_t)};
+      return positive_int{sizeof(int64_t)};
     case DataType::HALF:
-      return nonnegative_int{sizeof(float)} / 2_n;
+      return positive_int{sizeof(float) / 2};
     case DataType::FLOAT:
-      return nonnegative_int{sizeof(float)};
+      return positive_int{sizeof(float)};
     case DataType::DOUBLE:
-      return nonnegative_int{sizeof(double)};
+      return positive_int{sizeof(double)};
     default:
       throw mk_runtime_error(fmt::format("Unknown DataType {}", data_type));
   }
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc
index 4604ef0b4e..dfb77dac5d 100644
--- a/lib/op-attrs/src/op-attrs/datatype_value.cc
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -1,4 +1,5 @@
 #include "op-attrs/datatype_value.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
@@ -22,4 +23,14 @@ DataTypeValue make_bool_data_type_value(bool value) {
   return DataTypeValue{value};
 }
 
+DataType get_data_type_of_data_type_value(DataTypeValue value) {
+  return value.visit(overload {
+      [](float) { return DataType::FLOAT; },
+      [](double) { return DataType::DOUBLE; },
+      [](int32_t) { return DataType::INT32; },
+      [](int64_t) { return DataType::INT64; },
+      [](bool) { return DataType::BOOL; },
+  });
+}
+
 } // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/initializer_attrs.cc b/lib/op-attrs/src/op-attrs/initializer_attrs.cc
index 7635f170a0..b24b28a339 100644
--- a/lib/op-attrs/src/op-attrs/initializer_attrs.cc
+++ b/lib/op-attrs/src/op-attrs/initializer_attrs.cc
@@ -10,12 +10,12 @@ InitializerAttrs make_zero_initializer() {
 // fan_in and fan_out calculation from pytorch
 // see
 // https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363
-static nonnegative_int calculate_fan_for_mode(TensorDims const &dims,
-                                              KaimingInitializerMode mode) {
-  nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
-  nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
+static positive_int calculate_fan_for_mode(TensorDims const &dims,
+                                           KaimingInitializerMode mode) {
+  positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
+  positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
 
-  nonnegative_int receptive_field_size = get_num_elements(
+  positive_int receptive_field_size = get_num_elements(
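The new get_data_type_of_data_type_value leans on the usual visit-plus-overload idiom for variant types. A self-contained approximation against std::variant (the overloaded helper, DataTypeSketch, and DataTypeValueSketch are stand-ins; the library's DataTypeValue and utils/overload differ in detail):

    #include <cstdint>
    #include <variant>

    template <typename... Ts>
    struct overloaded : Ts... {
      using Ts::operator()...;
    };
    template <typename... Ts>
    overloaded(Ts...) -> overloaded<Ts...>;

    enum class DataTypeSketch { FLOAT, DOUBLE, INT32, INT64, BOOL };

    using DataTypeValueSketch =
        std::variant<float, double, int32_t, int64_t, bool>;

    DataTypeSketch data_type_of(DataTypeValueSketch const &v) {
      // std::visit dispatches to the lambda whose parameter type matches
      // the alternative currently held by the variant
      return std::visit(overloaded{
                            [](float) { return DataTypeSketch::FLOAT; },
                            [](double) { return DataTypeSketch::DOUBLE; },
                            [](int32_t) { return DataTypeSketch::INT32; },
                            [](int64_t) { return DataTypeSketch::INT64; },
                            [](bool) { return DataTypeSketch::BOOL; },
                        },
                        v);
    }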
slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { @@ -52,9 +52,9 @@ InitializerAttrs kaiming_uniform(TensorDims const &dims, KaimingInitializerNonlinearity nonlinearity, int seed) { - nonnegative_int fan = calculate_fan_for_mode(dims, mode); + positive_int fan = calculate_fan_for_mode(dims, mode); float gain = gain_for_nonlinearity(nonlinearity, a); - float std = gain / sqrtf(static_cast(fan.unwrap_nonnegative())); + float std = gain / sqrtf(static_cast(fan.int_from_positive_int())); float bound = sqrtf(3.0) * std; return InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc index b3d6e93c25..789903dc66 100644 --- a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc +++ b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc @@ -3,12 +3,12 @@ namespace FlexFlow { -nonnegative_int calculate_fan_for_mode(TensorDims const &dims, +positive_int calculate_fan_for_mode(TensorDims const &dims, KaimingInitializerMode mode) { - nonnegative_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); - nonnegative_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); + positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0}); + positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1}); - nonnegative_int receptive_field_size = get_num_elements( + positive_int receptive_field_size = get_num_elements( slice_tensor_dims(dims, relative_ff_dim_t{2}, std::nullopt)); if (mode == KaimingInitializerMode::FAN_IN) { diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 07d4f3e287..c5678e7bde 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -17,82 +17,82 @@ namespace FlexFlow { /* return is_valid; */ /* } */ -nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.vdim; } -nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { +positive_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.embed_dim; } -nonnegative_int get_qSize(TensorShape const &query_shape) { +positive_int get_qSize(TensorShape const &query_shape) { return dim_at_idx(query_shape, relative_ff_dim_t{0}); } -nonnegative_int get_kSize(TensorShape const &key_shape) { +positive_int get_kSize(TensorShape const &key_shape) { return dim_at_idx(key_shape, relative_ff_dim_t{0}); } -nonnegative_int get_vSize(TensorShape const &value_shape) { +positive_int get_vSize(TensorShape const &value_shape) { return dim_at_idx(value_shape, relative_ff_dim_t{0}); } -nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { +positive_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.query_dim.size; } -nonnegative_int get_qSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_qSize(MultiHeadAttentionInputs const &inputs) { return inputs.query_size; } -nonnegative_int get_kSize(MultiHeadAttentionParallelInputs 
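The kaiming_uniform hunk above follows the standard recipe: std = gain / sqrt(fan) and bound = sqrt(3) * std, so that a uniform(-bound, bound) distribution has exactly that standard deviation. A tiny worked check under assumed values (fan_in = 64 and a linear gain of 1.0 are chosen purely for illustration):

    #include <cmath>
    #include <cstdio>

    int main() {
      int fan_in = 64;   // assumed; FAN_IN = input fmaps * receptive field
      float gain = 1.0f; // assumed linear/identity nonlinearity
      float stddev = gain / std::sqrt(static_cast<float>(fan_in)); // 0.125
      float bound = std::sqrt(3.0f) * stddev;                      // ~0.2165
      std::printf("sample from uniform(-%f, %f)\n", bound, bound);
      return 0;
    }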
const &inputs) { +positive_int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.key_dim.size; } -nonnegative_int get_kSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_kSize(MultiHeadAttentionInputs const &inputs) { return inputs.key_size; } -nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { +positive_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.value_dim.size; } -nonnegative_int get_vSize(MultiHeadAttentionInputs const &inputs) { +positive_int get_vSize(MultiHeadAttentionInputs const &inputs) { return inputs.value_size; } -nonnegative_int +positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; } -nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { +positive_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; } -nonnegative_int +positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; // FIXME -- assumes only prefill } -nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { +positive_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; // FIXME -- assumes only prefil } -nonnegative_int +positive_int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { return inputs.batch_dim.size; } -nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { +positive_int get_num_samples(MultiHeadAttentionInputs const &inputs) { return inputs.batch_size; } @@ -139,7 +139,7 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ parsed.batch_size, parsed.sequence_length, attrs.embed_dim, @@ -164,20 +164,20 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); // W^Q_i in "Attention Is All You Need" top of page 5 - nonnegative_int qProjectWeightSize = parsed.query_size * attrs.kdim; + positive_int qProjectWeightSize = parsed.query_size * attrs.kdim; // W^K_i in "Attention Is All You Need" top of page 5 (all i's put together) - nonnegative_int kProjectWeightSize = parsed.key_size * attrs.kdim; + positive_int kProjectWeightSize = parsed.key_size * attrs.kdim; // W^V_i in "Attention Is All You Need" top of page 5 (all i's put together) - nonnegative_int vProjectWeightSize = parsed.value_size * attrs.vdim; + positive_int vProjectWeightSize = parsed.value_size * attrs.vdim; // W^O in "Attention Is All You Need" top of page 5, with num_heads factored // out - nonnegative_int outWeightSize = attrs.vdim * attrs.embed_dim; + positive_int outWeightSize = attrs.vdim * attrs.embed_dim; return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ (qProjectWeightSize + kProjectWeightSize + vProjectWeightSize + outWeightSize), attrs.num_heads, @@ -203,7 +203,7 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.kdim + attrs.kdim + attrs.vdim, }}, parsed.datatype, @@ -227,7 +227,7 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.embed_dim, }}, parsed.datatype, @@ -278,14 +278,14 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - nonnegative_int joined_dim_degree = 1_n; - nonnegative_int head_dim_degree = parsed.discard_copy_degree.value; + positive_int joined_dim_degree = 1_p; + positive_int head_dim_degree = parsed.discard_copy_degree.value; return 
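The weight-shape computation above packs W^Q, W^K, W^V, and W^O into one per-head tensor. A worked check of that arithmetic using the same illustrative sizes as the test case later in this patch (feature_size = 36, embed_dim = kdim = vdim = 32):

    #include <cassert>

    int main() {
      int feature_size = 36, embed_dim = 32, kdim = 32, vdim = 32;
      int q_proj = feature_size * kdim; // W^Q_i
      int k_proj = feature_size * kdim; // W^K_i
      int v_proj = feature_size * vdim; // W^V_i
      int out_w = vdim * embed_dim;     // W^O, num_heads factored out
      // matches the (feature_size * embed_dim) * 3 + embed_dim * embed_dim
      // expression used by the attention test below
      assert(q_proj + k_proj + v_proj + out_w ==
             (feature_size * embed_dim) * 3 + embed_dim * embed_dim);
      return 0;
    }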
lift_to_parallel_with_degrees( unpar_shape, - SumDegree{1_n}, + SumDegree{1_p}, DiscardCopyDegree{parsed.batch_dim.degree}, - FFOrdered{joined_dim_degree, head_dim_degree}); + FFOrdered{joined_dim_degree, head_dim_degree}); } tl::expected @@ -318,10 +318,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1_n}; + FFOrdered shard_degrees = FFOrdered{1_p}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -356,10 +356,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1_n}; + FFOrdered shard_degrees = FFOrdered{1_p}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -388,24 +388,24 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - nonnegative_int sum_degree = parsed.discard_copy_degree.value; - nonnegative_int discard_copy_degree = 1_n; - nonnegative_int batch_degree = parsed.batch_dim.degree; - nonnegative_int seq_len_degree = 1_n; - nonnegative_int out_dim_degree = 1_n; + positive_int sum_degree = parsed.discard_copy_degree.value; + positive_int discard_copy_degree = 1_p; + positive_int batch_degree = parsed.batch_dim.degree; + positive_int seq_len_degree = 1_p; + positive_int out_dim_degree = 1_p; return lift_to_parallel_with_degrees( unpar_shape, SumDegree{sum_degree}, DiscardCopyDegree{discard_copy_degree}, - FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); + FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); } -nonnegative_int get_oSize(ParallelTensorShape const &) { +positive_int get_oSize(ParallelTensorShape const &) { NOT_IMPLEMENTED(); } -nonnegative_int get_oSize(TensorShape const &) { +positive_int get_oSize(TensorShape const &) { NOT_IMPLEMENTED(); } diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index b9049bf461..7bf3b9d91e 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - nonnegative_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - nonnegative_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - nonnegative_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + positive_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + positive_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + positive_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - nonnegative_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - nonnegative_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - nonnegative_int batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); + positive_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + positive_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + positive_int 
batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected input_v.data_type)); } - nonnegative_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - nonnegative_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - nonnegative_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + positive_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + positive_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + positive_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index d69b62b759..3225f1aef2 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -107,9 +107,9 @@ tl::expected value_dim.degree)); } - nonnegative_int discard_copy_q = get_discard_copy_degree(input_q); - nonnegative_int discard_copy_k = get_discard_copy_degree(input_k); - nonnegative_int discard_copy_v = get_discard_copy_degree(input_v); + positive_int discard_copy_q = get_discard_copy_degree(input_q); + positive_int discard_copy_k = get_discard_copy_degree(input_k); + positive_int discard_copy_v = get_discard_copy_degree(input_v); if (!all_same(discard_copy_q, discard_copy_k, discard_copy_v)) { return tl::unexpected(fmt::format("Q, K, V disagree on the discard-copy " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index d32ae33d14..d11a8aba10 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - nonnegative_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - nonnegative_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - nonnegative_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + positive_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + positive_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + positive_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - nonnegative_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - nonnegative_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - nonnegative_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + positive_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + positive_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + positive_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -76,7 +76,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ lhs_b, n, p, @@ -151,10 +151,11 @@ tl::expected ShardParallelDim output_n = n; ShardParallelDim output_p = p; - nonnegative_int output_discard_copy_degree = 1_n; - nonnegative_int output_sum_degree = + positive_int output_discard_copy_degree = 1_p; + positive_int output_sum_degree = positive_int{ get_total_parallel_degree(input_lhs) / - (output_b.degree * output_n.degree * output_p.degree); + (output_b.degree * output_n.degree * output_p.degree) + }; ParallelTensorShape result = ParallelTensorShape{ ParallelTensorDims{ diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index 
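The batch_matmul output_sum_degree expression above is bookkeeping: whatever parallelism of the input is not accounted for by the output's own shard degrees must reappear as a sum reduction. A sketch with plain ints (the concrete numbers are illustrative, and divisibility is assumed to have been validated upstream):

    #include <cassert>

    int main() {
      int total_degree = 8;    // total parallel degree of the lhs input
      int b = 2, n = 2, p = 1; // output shard degrees
      assert(total_degree % (b * n * p) == 0);
      int sum_degree = total_degree / (b * n * p);
      assert(sum_degree == 2); // two-way partial sums remain
      return 0;
    }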
ddd92bd417..f42467320b 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -68,10 +68,10 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_channels, }}, DataType::FLOAT, @@ -113,23 +113,23 @@ static std::optional input_degrees); } - if (input_degrees.sum_degree != SumDegree{1_n}) { + if (input_degrees.sum_degree != SumDegree{1_p}) { return fmt::format("Expected sum degree 1, but receieved sum degree {}", input_degrees.sum_degree); } - if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_n}) { + if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_p}) { return fmt::format( "Expected discard copy degree 1, but receieved discard copy degree {}", input_degrees.discard_copy_degree); } - FFOrdered non_channel_degrees = + FFOrdered non_channel_degrees = concat(slice(input_degrees.shard_degrees, ff_dim_t{0_n}, ff_dim_t{1_n}), slice(input_degrees.shard_degrees, ff_dim_t{2_n}, std::nullopt)); if (any_of(non_channel_degrees, - [](nonnegative_int degree) { return degree != 1_n; })) { + [](positive_int degree) { return degree != 1_p; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " "to be 1, but received input with degrees {}", input_degrees); @@ -172,9 +172,9 @@ tl::expected relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/combine.cc b/lib/op-attrs/src/op-attrs/ops/combine.cc index 636f37dcea..c55bdc55bb 100644 --- a/lib/op-attrs/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/src/op-attrs/ops/combine.cc @@ -44,8 +44,10 @@ tl::expected } ParallelTensorShape output = input; - shard_dim_at_idx(output, relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim)) - .degree /= attrs.combine_degree; + relative_ff_dim_t combine_dim = relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + shard_dim_at_idx(output, combine_dim).degree = positive_int{ + shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree + }; return output; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index bf0ba553e4..b41d1ffc32 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,7 +17,7 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = + std::map dim_sizes = enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; @@ -41,8 +41,8 @@ tl::expected inputs)); } - std::map non_axis_dims = ({ - tl::expected, std::string> returned = + std::map non_axis_dims = ({ + tl::expected, std::string> returned = require_all_same1(transform(inputs, get_non_axis_dims)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -50,12 +50,12 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = + std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); 
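The combine.cc rewrite above replaces an in-place /= on the degree with an explicit positive_int construction, since plain integer division does not carry the positivity invariant through the type system. A sketch of the checked division it corresponds to (checked_combine is a hypothetical helper, not a library function):

    #include <stdexcept>

    int checked_combine(int degree, int combine_degree) {
      if (degree % combine_degree != 0) {
        throw std::invalid_argument(
            "combine_degree must evenly divide the dimension's degree");
      }
      return degree / combine_degree; // >= 1 whenever the inputs are valid
    }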
}); - nonnegative_int output_axis_dim_size = sum(axis_dim_sizes); + positive_int output_axis_dim_size = sum(axis_dim_sizes); non_axis_dims.insert({attrs.axis, output_axis_dim_size}); @@ -89,7 +89,7 @@ tl::expected }); SumDegree sum_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_sum_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -98,7 +98,7 @@ tl::expected }); DiscardCopyDegree discard_copy_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_discard_copy_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index 902417d050..af4b6cd898 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -27,7 +27,7 @@ TensorShape get_kernel_shape(Conv2DAttrs const &attrs, Conv2DInputShape input = parse_input_shape(raw_input_shape); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ attrs.out_channels, input.num_channels, attrs.kernel_h, @@ -44,22 +44,22 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, return TensorShape{ TensorDims{ - FFOrdered{attrs.out_channels}, + FFOrdered{attrs.out_channels}, }, input.datatype, }; } -static nonnegative_int calculate_output_size(nonnegative_int input_size, +static positive_int calculate_output_size(positive_int input_size, nonnegative_int padding_size, - nonnegative_int kernel_size, - nonnegative_int stride) { - int input_size_raw = input_size.unwrap_nonnegative(); + positive_int kernel_size, + positive_int stride) { + int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); - int kernel_size_raw = kernel_size.unwrap_nonnegative(); - int stride_raw = stride.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.int_from_positive_int(); + int stride_raw = stride.int_from_positive_int(); - return nonnegative_int{ + return positive_int{ (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; } @@ -68,18 +68,18 @@ TensorShape get_output_shape(Conv2DAttrs const &attrs, assert(attrs.groups == 1); // TODO(@lockshaw): currently not supported Conv2DInputShape input = parse_input_shape(raw_input_shape); - nonnegative_int out_height = + positive_int out_height = calculate_output_size(/*input_size=*/input.height, /*padding_size=*/attrs.padding_h, /*kernel_size=*/attrs.kernel_h, /*stride_size=*/attrs.stride_h); - nonnegative_int out_width = + positive_int out_width = calculate_output_size(/*input_size=*/input.width, /*padding_size=*/attrs.padding_w, /*kernel_size=*/attrs.kernel_w, /*stride_size=*/attrs.stride_w); - return TensorShape{TensorDims{FFOrdered{ + return TensorShape{TensorDims{FFOrdered{ input.num_samples, attrs.out_channels, out_height, @@ -112,14 +112,14 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, assert(parsed.height_dim.degree == 1); assert(parsed.width_dim.degree == 1); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, - 1_n, - 1_n, + 1_p, + 1_p, }; return lift_to_parallel_with_degrees( @@ -139,7 +139,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, 
DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, }; @@ -160,12 +160,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = { + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = { parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, - 1_n, - 1_n, + 1_p, + 1_p, }; return lift_to_parallel_with_degrees( @@ -217,11 +217,10 @@ std::vector InitializerAttrs kernel_initializer = maybe_kernel_initializer.value_or(kernel_default_initializer); - nonnegative_int fan_in = + positive_int fan_in = calculate_fan_for_mode(kernel_shape.dims, KaimingInitializerMode::FAN_IN); - assert(fan_in != 0_n); - float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); + float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index 1491410491..75db5c56fb 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - nonnegative_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - nonnegative_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - nonnegative_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); - nonnegative_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + positive_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + positive_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); + positive_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 5b5b91a8e7..809b4cdaf9 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -68,7 +68,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ attrs.num_entries, attrs.out_channels, }, @@ -92,8 +92,8 @@ tl::expected SumDegree sum_degree = SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); @@ -114,13 +114,13 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(transform( ff_ordered_shard_dims(input.dims), - [](ShardParallelDim const &d) -> nonnegative_int { return d.degree; }))}; - nonnegative_int entry_dim_degree = 1_n; - nonnegative_int out_channel_degree = 
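Note the dropped assert(fan_in != 0_n) in the conv_2d bias-initializer hunk above: once fan_in is a positive_int, the zero case cannot be constructed, so the subsequent division needs no runtime guard. A reduced sketch, with plain int standing in for positive_int:

    #include <cmath>

    // fan_in arrives as a positive value by construction, so the old
    // runtime assertion becomes unrepresentable state
    float bias_bound(int fan_in /* >= 1 by construction */) {
      return 1.0f / std::sqrt(static_cast<float>(fan_in));
    }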
get_discard_copy_degree(input); - FFOrdered shard_degrees = { + [](ShardParallelDim const &d) -> positive_int { return d.degree; }))}; + positive_int entry_dim_degree = 1_p; + positive_int out_channel_degree = get_discard_copy_degree(input); + FFOrdered shard_degrees = { entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index b4eeda76ab..a2183a71b4 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,11 +11,11 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = + FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), ff_dim_t{0_n}, attrs.start_dim); - FFOrdered flattened_dims = + FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); - FFOrdered trailing_dims = + FFOrdered trailing_dims = slice(ff_ordered(input_shape.dims), attrs.end_dim, std::nullopt); if (flattened_dims.empty()) { @@ -37,7 +37,7 @@ TensorShape get_output_shape(FlatAttrs const &attrs, tl::expected get_output_parallel_dim_degrees( FlatAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - FFOrdered flattened_dim_degrees = + FFOrdered flattened_dim_degrees = slice(input_degrees.shard_degrees, attrs.start_dim, attrs.end_dim); if (flattened_dim_degrees.empty()) { @@ -45,7 +45,7 @@ tl::expected } if (any_of(flattened_dim_degrees, - [](nonnegative_int degree) { return degree != 1; })) { + [](positive_int degree) { return degree != 1; })) { return tl::unexpected( fmt::format("get_output_parallel_dim_degrees for {} expected all shard " "degrees of flattened dimensions to be 1, but received {}", diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index c9798368e2..3637aacc5c 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -72,7 +72,7 @@ tl::expected std::vector non_layer_norm_dim_idxs = filter( get_idxs(input_shape.dims.ff_ordered), [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); - std::vector raw_weight_dims = + std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { return dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(dim_idx)); @@ -190,8 +190,8 @@ tl::expected ParallelTensorDims{ ff_ordered_of(raw_weight_shard_dims), ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index bee9d0cf4f..578e9ce652 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -44,11 +44,11 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - nonnegative_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + positive_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, attrs.out_channels}, + FFOrdered{in_channels, attrs.out_channels}, }, input_shape.data_type, }; @@ -58,7 +58,7 @@ tl::expected get_bias_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { return TensorShape{ TensorDims{ - FFOrdered{attrs.out_channels}, + FFOrdered{attrs.out_channels}, }, input_shape.data_type, }; @@ 
-99,12 +99,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1_n}; + SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{ + FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, get_discard_copy_degree(input), }; @@ -131,8 +131,8 @@ tl::expected DiscardCopyDegree{product(slice(ff_ordered_shard_degrees(input), relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = - FFOrdered{get_discard_copy_degree(input)}; + FFOrdered shard_degrees = + FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); @@ -153,8 +153,8 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; - FFOrdered shard_degrees = ff_ordered_shard_degrees(input); + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; + FFOrdered shard_degrees = ff_ordered_shard_degrees(input); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( @@ -209,10 +209,10 @@ tl::expected, std::string> get_initializers( InitializerAttrs projection_initializer = maybe_projection_initializer.value_or(projection_default_initializer); - nonnegative_int fan_in = calculate_fan_for_mode( + positive_int fan_in = calculate_fan_for_mode( projection_shape.dims, KaimingInitializerMode::FAN_IN); - float bound = 1 / sqrtf(static_cast(fan_in.unwrap_nonnegative())); + float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); InitializerAttrs bias_default_initializer = InitializerAttrs{UniformInitializerAttrs{ diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index f9630e16b1..c542d688b3 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -8,8 +8,8 @@ namespace FlexFlow { tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp pool_type, std::optional const &activation) { // AdaptivePool2D semantics pulled from @@ -22,10 +22,10 @@ tl::expected input_dims)); } - nonnegative_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); - nonnegative_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); - nonnegative_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); - nonnegative_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + positive_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + positive_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + positive_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -55,11 +55,11 @@ tl::expected // = `ind / outd` // = `stride` - nonnegative_int kernel_h = input_h / output_h; - nonnegative_int kernel_w = input_w / output_w; + positive_int kernel_h = positive_int{input_h / output_h}; + positive_int kernel_w = positive_int{input_w / output_w}; - nonnegative_int stride_h = kernel_h; - nonnegative_int stride_w = kernel_w; + 
positive_int stride_h = kernel_h; + positive_int stride_w = kernel_w; Pool2DAttrs attrs = Pool2DAttrs{ /*kernel_h=*/kernel_h, @@ -73,7 +73,7 @@ tl::expected }; TensorShape expected_ouput_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, num_channels, output_h, @@ -104,16 +104,16 @@ tl::expected return attrs; } -static nonnegative_int calculate_output_size(nonnegative_int input_size, +static positive_int calculate_output_size(positive_int input_size, nonnegative_int padding_size, - nonnegative_int kernel_size, - nonnegative_int stride) { - int input_size_raw = input_size.unwrap_nonnegative(); + positive_int kernel_size, + positive_int stride) { + int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); - int kernel_size_raw = kernel_size.unwrap_nonnegative(); - int stride_raw = stride.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.int_from_positive_int(); + int stride_raw = stride.int_from_positive_int(); - return nonnegative_int{ + return positive_int{ (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; } @@ -126,23 +126,23 @@ tl::expected input_shape)); } - nonnegative_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); - nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - nonnegative_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - nonnegative_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + positive_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); - nonnegative_int output_height = + positive_int output_height = calculate_output_size(/*input_size=*/input_height, /*padding_size=*/attrs.padding_h, /*kernel_size=*/attrs.kernel_h, /*stride_size=*/attrs.stride_h); - nonnegative_int output_width = + positive_int output_width = calculate_output_size(/*input_size=*/input_width, /*padding_size=*/attrs.padding_w, /*kernel_size=*/attrs.kernel_w, /*stride_size=*/attrs.stride_w); - return TensorShape{TensorDims{FFOrdered{ + return TensorShape{TensorDims{FFOrdered{ num_samples, num_channels, output_height, diff --git a/lib/op-attrs/src/op-attrs/ops/reduction.cc b/lib/op-attrs/src/op-attrs/ops/reduction.cc index 0a9f3e3b97..007559a816 100644 --- a/lib/op-attrs/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/src/op-attrs/ops/reduction.cc @@ -29,7 +29,10 @@ tl::expected } ParallelTensorShape output_shape = input_shape; - output_shape.dims.replica_dims.sum_degree.value /= attrs.reduction_degree; + + output_shape.dims.replica_dims.sum_degree.value = positive_int{ + output_shape.dims.replica_dims.sum_degree.value / attrs.reduction_degree + }; return output_shape; } diff --git a/lib/op-attrs/src/op-attrs/ops/weight.cc b/lib/op-attrs/src/op-attrs/ops/weight.cc index 906d2c58d0..710529af0a 100644 --- a/lib/op-attrs/src/op-attrs/ops/weight.cc +++ b/lib/op-attrs/src/op-attrs/ops/weight.cc @@ -6,7 +6,7 @@ namespace FlexFlow { RecordFormatter as_dot(WeightAttrs const &attrs) { RecordFormatter r; - for (nonnegative_int dim : attrs.tensor_shape.dims.ff_ordered) { + for (positive_int dim : attrs.tensor_shape.dims.ff_ordered) { r << fmt::to_string(dim); } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 
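The adaptive-pool derivation above sets kernel = stride = input/output per spatial dimension, and the forward size formula then recovers exactly the requested output. A worked check with illustrative sizes (8 -> 2, so kernel = stride = 4):

    #include <cassert>

    // same formula as calculate_output_size in the hunk above
    int output_size(int in, int pad, int kernel, int stride) {
      return (in + 2 * pad - kernel) / stride + 1;
    }

    int main() {
      int input_h = 8, output_h = 2;
      assert(input_h % output_h == 0); // validated by the attrs builder
      int kernel_h = input_h / output_h; // 4
      int stride_h = kernel_h;           // 4
      assert(output_size(input_h, /*pad=*/0, kernel_h, stride_h) == output_h);
      return 0;
    }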
3f2245b2dc..8a96bc25ba 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -19,7 +19,7 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); @@ -43,22 +43,22 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { } ParallelTensorDims lift_to_parallel(TensorDims const &dims) { - std::vector shard_degrees = - repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_n); + std::vector shard_degrees = + repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_p); return lift_to_parallel_with_degrees( - dims, SumDegree{1_n}, DiscardCopyDegree{1_n}, shard_degrees); + dims, SumDegree{1_p}, DiscardCopyDegree{1_p}, shard_degrees); } ParallelTensorDims lift_to_parallel_with_degrees( TensorDims const &unpar, SumDegree const &sum_degree, DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { + FFOrdered const &shard_degrees) { std::vector lifted = transform(zip(vector_of(unpar.ff_ordered), vector_of(shard_degrees)), - [](std::pair const &p) { - nonnegative_int size = p.first; - nonnegative_int degree = p.second; + [](std::pair const &p) { + positive_int size = p.first; + positive_int degree = p.second; return ShardParallelDim{size, degree}; }); @@ -78,17 +78,17 @@ ParallelTensorDims degrees.shard_degrees); } -nonnegative_int total_replica_degree(ParallelTensorDims const &dims) { +positive_int total_replica_degree(ParallelTensorDims const &dims) { return dims.replica_dims.discard_copy_degree.value * dims.replica_dims.sum_degree.value; } -nonnegative_int total_shard_degree(ParallelTensorDims const &dims) { +positive_int total_shard_degree(ParallelTensorDims const &dims) { return product(transform(vector_of(dims.shard_dims), [](ShardParallelDim const &d) { return d.degree; })); } -nonnegative_int total_parallel_degree(ParallelTensorDims const &dims) { +positive_int total_parallel_degree(ParallelTensorDims const &dims) { return total_replica_degree(dims) * total_shard_degree(dims); } @@ -118,7 +118,7 @@ TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &) { } TensorDims get_reduced_dims(ParallelTensorDims const &dims) { - FFOrdered dim_sizes = transform( + FFOrdered dim_sizes = transform( dims.shard_dims, [](ShardParallelDim const &d) { return d.size; }); return TensorDims{dim_sizes}; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index 260ec7c3cd..ff6debee4f 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -20,21 +20,21 @@ std::unordered_set return replica_dims(s.dims); } -nonnegative_int get_num_replicas(ParallelTensorShape const &shape) { +positive_int get_num_replicas(ParallelTensorShape const &shape) { return product(transform( replica_dims(shape), - [](ReplicaParallelDim const &d) -> nonnegative_int { return d.degree; })); + [](ReplicaParallelDim const &d) -> positive_int { return d.degree; })); } -nonnegative_int get_sum_degree(ParallelTensorShape const &shape) { +positive_int get_sum_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.sum_degree.value; } -nonnegative_int get_discard_copy_degree(ParallelTensorShape const &shape) { +positive_int get_discard_copy_degree(ParallelTensorShape const &shape) { 
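lift_to_parallel, as rewritten above, pairs every tensor dimension with a trivial degree of 1_p: an unpartitioned tensor is the degenerate case of a parallel one. A structural sketch with plain types (DimSketch and lift_sketch are illustrative only):

    #include <vector>

    struct DimSketch {
      int size;
      int degree;
    };

    std::vector<DimSketch> lift_sketch(std::vector<int> const &sizes) {
      std::vector<DimSketch> result;
      for (int size : sizes) {
        result.push_back(DimSketch{size, /*degree=*/1}); // unpartitioned
      }
      return result;
    }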
return shape.dims.replica_dims.discard_copy_degree.value; } -nonnegative_int get_total_parallel_degree(ParallelTensorShape const &s) { +positive_int get_total_parallel_degree(ParallelTensorShape const &s) { return total_parallel_degree(s.dims); } @@ -52,7 +52,7 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -79,7 +79,7 @@ ParallelTensorShape lift_to_parallel_with_degrees( TensorShape const &unpar, SumDegree const &sum_degree, DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { + FFOrdered const &shard_degrees) { return ParallelTensorShape{ lift_to_parallel_with_degrees( unpar.dims, sum_degree, discard_copy_degree, shard_degrees), @@ -97,8 +97,8 @@ ParallelTensorShape } TensorShape require_not_parallel(ParallelTensorShape const &s) { - nonnegative_int total_degree = get_total_parallel_degree(s); - if (total_degree != 1_n) { + positive_int total_degree = get_total_parallel_degree(s); + if (total_degree != 1_p) { throw mk_runtime_error( fmt::format("Error: require_not_parallel received a parallel tensor " "shape with parallel degree {}: {}", @@ -132,7 +132,7 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, }, [&](ReplicaType replica_type) { ReplicaParallelDimSet replicas = shape.dims.replica_dims; - nonnegative_int degree = (ReplicaType::SUM == replica_type + positive_int degree = (ReplicaType::SUM == replica_type ? replicas.sum_degree.value : replicas.discard_copy_degree.value); return ParallelDim{ReplicaParallelDim{degree, replica_type}}; diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index fc712be10b..41fb988bf7 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -4,10 +4,10 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set() { - return ReplicaParallelDimSet{SumDegree{1_n}, DiscardCopyDegree{1_n}}; + return ReplicaParallelDimSet{SumDegree{1_p}, DiscardCopyDegree{1_p}}; } -nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, +positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: @@ -29,8 +29,4 @@ std::unordered_set }; } -bool is_valid(ReplicaParallelDimSet const &s) { - return s.sum_degree.value > 0 && s.discard_copy_degree.value > 0; -} - } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index 760278297c..b48a23b281 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -14,7 +14,7 @@ namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &dims) { +FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } @@ -22,15 +22,15 @@ nonnegative_int num_dims(TensorDims const &dims) { return num_elements(dims.ff_ordered); } -nonnegative_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { +positive_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -nonnegative_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { +positive_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -nonnegative_int get_num_elements(TensorDims const &d) 
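The deletion of is_valid(ReplicaParallelDimSet) above is the payoff of the whole migration: the degree > 0 condition it checked now holds by construction, so the function is dead code. A sketch of why (PositiveDegree is a stand-in for the wrapper type):

    #include <cassert>

    struct PositiveDegree {
      explicit PositiveDegree(int v) : value(v) {
        assert(v >= 1); // enforced once, at construction, not at use sites
      }
      int value;
    };

    // Any dim set built from PositiveDegree values cannot represent the
    // degree == 0 state that is_valid() used to reject.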
{ +positive_int get_num_elements(TensorDims const &d) { return product(d.ff_ordered); } @@ -40,8 +40,8 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return false; } - std::vector curr_dims = vector_of(curr.ff_ordered); - std::vector goal_dims = vector_of(goal.ff_ordered); + std::vector curr_dims = vector_of(curr.ff_ordered); + std::vector goal_dims = vector_of(goal.ff_ordered); for (auto const &[curr_dim, goal_dim] : zip(reversed(curr_dims), reversed(goal_dims))) { diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index afc14af54c..7a1ba810a7 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -12,19 +12,19 @@ nonnegative_int num_dims(TensorShape const &s) { return num_elements(s.dims.ff_ordered); } -nonnegative_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { +positive_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -nonnegative_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { +positive_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -nonnegative_int get_num_elements(TensorShape const &s) { +positive_int get_num_elements(TensorShape const &s) { return get_num_elements(s.dims); } -nonnegative_int get_size_in_bytes(TensorShape const &s) { +positive_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index b317c5c69c..a99fe167c7 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -10,10 +10,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs)") { auto make_attrs = [](bool bias) { return MultiHeadAttentionAttrs{ - /*embed_dim=*/32_n, - /*num_heads=*/10_n, - /*kdim=*/32_n, - /*vdim=*/32_n, + /*embed_dim=*/32_p, + /*num_heads=*/10_p, + /*kdim=*/32_p, + /*vdim=*/32_p, /*dropout=*/0.0, /*bias=*/bias, /*add_bias_kv=*/false, @@ -58,8 +58,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(MultiHeadAttentionAttrs, TensorShape, " "TensorShape, TensorShape)") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; /* Parameter meanings match those at * https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html @@ -75,13 +75,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; TensorShape input_q = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_k = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -103,7 +103,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_v = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -114,7 +114,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, attrs.embed_dim, @@ -125,8 +125,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - (feature_size * 
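get_size_in_bytes above is simply get_num_elements times size_of_datatype. A worked check using the FLOAT test tensors that follow (shape 40 x 48 x 36, 4 bytes per element):

    #include <cassert>

    int main() {
      long long num_elements = 40LL * 48 * 36; // batch * seq_len * features
      long long size_in_bytes = num_elements * 4; // sizeof(float)
      assert(num_elements == 69120);
      assert(size_in_bytes == 276480);
      return 0;
    }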
embed_dim) * 3_n + (embed_dim * embed_dim), + FFOrdered{ + (feature_size * embed_dim) * 3_p + (embed_dim * embed_dim), num_heads, }, }, @@ -135,8 +135,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_bias = TensorShape{ TensorDims{ - FFOrdered{ - embed_dim * 3_n, + FFOrdered{ + embed_dim * 3_p, }, }, DataType::FLOAT, @@ -144,7 +144,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_bias = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ embed_dim, }, }, @@ -184,94 +184,94 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallel shape inference") { auto make_q = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_q) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_q) { return lift_to_parallel_with_degrees( input_q, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_q}); + FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_k) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_k) { return lift_to_parallel_with_degrees( input_k, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_k}); + FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_v) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_v) { return lift_to_parallel_with_degrees( input_v, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_v}); + FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_seq_len, - nonnegative_int o_o) { + positive_int o_batch, + positive_int o_seq_len, + positive_int o_o) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_seq_len, o_o}); + FFOrdered{o_batch, o_seq_len, o_o}); }; auto make_w = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_e, - nonnegative_int o_h) { + positive_int o_e, + positive_int o_h) { return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_e, o_h}); + weights, o_sum, o_eq, FFOrdered{o_e, o_h}); }; auto make_input_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_in_proj_channel) { + positive_int o_in_proj_channel) { return lift_to_parallel_with_degrees( input_bias, o_sum, o_eq, - FFOrdered{o_in_proj_channel}); + FFOrdered{o_in_proj_channel}); }; auto make_output_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_out_proj_channel) { + positive_int o_out_proj_channel) { return lift_to_parallel_with_degrees( output_bias, o_sum, o_eq, - FFOrdered{o_out_proj_channel}); + FFOrdered{o_out_proj_channel}); }; SUBCASE("data parallelism") { - nonnegative_int o_b = 4_n; + positive_int o_b = 4_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_o(SumDegree{1_p}, 
DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -279,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, 1_n); + make_w(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p, 1_p); CHECK(result == correct); } @@ -287,7 +287,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); + make_input_bias(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p); CHECK(result == correct); } @@ -295,25 +295,25 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); + make_output_bias(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p); CHECK(result == correct); } } SUBCASE("attention head parallelism") { - nonnegative_int o_h = 2_n; + positive_int o_h = 2_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -321,7 +321,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_h); + make_w(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_h); CHECK(result == correct); } @@ -329,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); + make_input_bias(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p); CHECK(result == correct); } @@ -337,26 +337,26 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); + make_output_bias(SumDegree{1_p}, DiscardCopyDegree{o_h}, 1_p); CHECK(result == correct); } } SUBCASE("combined data & attention head parallelism") { - nonnegative_int o_b = 4_n; - nonnegative_int o_h = 2_n; + positive_int o_b = 4_p; + positive_int o_h = 2_p; ParallelTensorShape q = - make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_q(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); ParallelTensorShape k = - make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_k(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); ParallelTensorShape v = - make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); + make_v(SumDegree{1_p}, DiscardCopyDegree{o_h}, o_b, 1_p, 1_p); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -364,7 +364,7 @@ TEST_SUITE(FF_TEST_SUITE) { 
tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, o_h); + make_w(SumDegree{1_p}, DiscardCopyDegree{o_b}, 1_p, o_h); CHECK(result == correct); } @@ -373,7 +373,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_input_bias_shape(attrs, q, k, v); tl::expected correct = make_input_bias( - SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); + SumDegree{1_p}, DiscardCopyDegree{o_b * o_h}, 1_p); CHECK(result == correct); } @@ -382,7 +382,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_output_bias_shape(attrs, q, k, v); tl::expected correct = make_output_bias( - SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); + SumDegree{1_p}, DiscardCopyDegree{o_b * o_h}, 1_p); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc index 27c59ee497..d251fb731d 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(BatchMatmulAttrs, TensorShape)") { - nonnegative_int b = 4_n; - nonnegative_int m = 6_n; - nonnegative_int n = 8_n; - nonnegative_int p = 10_n; + positive_int b = 4_p; + positive_int m = 6_p; + positive_int n = 8_p; + positive_int p = 10_p; BatchMatmulAttrs attrs = BatchMatmulAttrs{ /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, m, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, m, p, @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct_output_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, p, @@ -60,8 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched b") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - b + 1_n, + FFOrdered{ + b + 1_p, m, p, }, @@ -78,9 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched m") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, - m + 1_n, + m + 1_p, p, }, }, @@ -95,15 +95,15 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_output_shape(BatchMatmulAttrs, ParallelTensorShape)") { - nonnegative_int b = 2_n * 2_n; - nonnegative_int o_b = 2_n; - nonnegative_int m = 3_n * 3_n; - nonnegative_int o_m = 3_n; - nonnegative_int n = 5_n * 5_n; - nonnegative_int o_n = 5_n; - nonnegative_int p = 7_n * 7_n; - nonnegative_int o_p = 7_n; - nonnegative_int o_sum = 11_n; + positive_int b = 2_p * 2_p; + positive_int o_b = 2_p; + positive_int m = 3_p * 3_p; + positive_int o_m = 3_p; + positive_int n = 5_p * 5_p; + positive_int o_n = 5_p; + positive_int p = 7_p * 7_p; + positive_int o_p = 7_p; + positive_int o_sum = 11_p; BatchMatmulAttrs attrs = BatchMatmulAttrs{ /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are @@ -113,9 +113,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_n, - nonnegative_int o_m) { + positive_int o_b, + positive_int o_n, + positive_int o_m) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_m, - nonnegative_int o_p) { + positive_int o_b, + 
positive_int o_m, + positive_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -155,9 +155,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_b, - nonnegative_int o_n, - nonnegative_int o_p) { + positive_int o_b, + positive_int o_n, + positive_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -177,10 +177,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("data parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, o_b, 1_p, 1_p); CHECK(result == correct); } @@ -188,10 +188,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{o_n}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -199,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("p parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_p}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{o_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_p)); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_p); CHECK(result == correct); } @@ -210,10 +210,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_m), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_m, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, o_m), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, o_m, 1_p)); tl::expected correct = - make_output(SumDegree{o_m}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_m}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -221,10 +221,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction lhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{1_p}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -232,10 +232,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), - 
make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -243,10 +243,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, 1_p, 1_p)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } @@ -254,8 +254,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & rhs (invalid)") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); CHECK_MESSAGE( !result.has_value(), "Unexpected successful value: ", result); @@ -264,11 +264,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p), make_rhs( - SumDegree{1_n}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); + SumDegree{1_p}, DiscardCopyDegree{o_sum * o_n}, 1_p, 1_p, 1_p)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -276,11 +276,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, 1_n), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, o_n, 1_p), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_p, 1_p, 1_p)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_p}, 1_p, o_n, 1_p); CHECK(result == correct); } @@ -288,15 +288,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n & m") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, o_m), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_p, o_n, o_m), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, o_m, 1_n)); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_p, o_m, 1_p)); tl::expected correct = make_output(SumDegree{o_sum * o_sum * o_m}, - DiscardCopyDegree{1_n}, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, o_n, - 1_n); + 1_p); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc index cd9796945c..b70e8fcb4e 100644 --- 
a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc @@ -60,11 +60,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, - 18_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, + 18_p, }}, DataType::FLOAT, }; @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 14_n, + TensorDims{FFOrdered{ + 14_p, }}, DataType::FLOAT, }; @@ -140,16 +140,16 @@ TEST_SUITE(FF_TEST_SUITE) { }(); SUBCASE("partition parallelism (in channel dim)") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, degree, - 1_n, - 1_n, + 1_p, + 1_p, }, }; @@ -169,9 +169,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{degree}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -194,9 +194,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{degree}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -214,12 +214,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (not in channel dim)") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, degree, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, degree, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -251,12 +251,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ sum_degree, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -288,12 +288,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, + SumDegree{1_p}, discard_copy_degree, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{14_n, 2_n}, - ShardParallelDim{16_n, 1_n}, - ShardParallelDim{18_n, 1_n}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{14_p, 2_p}, + ShardParallelDim{16_p, 1_p}, + ShardParallelDim{18_p, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 2_n}, + 
ShardParallelDim{14_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, @@ -388,11 +388,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 2_n}, + ShardParallelDim{14_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index e9ec890b4b..eeba779dfe 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -12,15 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { CastAttrs attrs = CastAttrs{output_datatype}; - nonnegative_int d1 = 12_n; - nonnegative_int d2 = 16_n; + positive_int d1 = 12_p; + positive_int d2 = 16_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, input_datatype, }; TensorShape output = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, output_datatype, }; @@ -34,30 +34,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_output_shape(CastAttrs, ParallelTensorShape)") { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_features) { + positive_int o_batch, + positive_int o_features) { return lift_to_parallel_with_degrees( input, o_sum, o_eq, - FFOrdered{o_batch, o_features}); + FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_outchannels) { + positive_int o_batch, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_outchannels}); + FFOrdered{o_batch, o_outchannels}); }; - SumDegree sum_degree = SumDegree{2_n}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_n}; - nonnegative_int batch_degree = 4_n; - nonnegative_int feature_degree = 8_n; + SumDegree sum_degree = SumDegree{2_p}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_p}; + positive_int batch_degree = 4_p; + positive_int feature_degree = 8_p; ParallelTensorShape par_input = make_input( sum_degree, discard_copy_degree, batch_degree, feature_degree); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 14fbca5b3a..07520e7cce 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -10,14 +10,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -25,7 +25,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 3_n; + positive_int degree = 3_p; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -36,7 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; - output.dims.shard_dims.at(dim).degree /= degree; + positive_int old_shard_degree = 
output.dims.shard_dims.at(dim).degree; + output.dims.shard_dims.at(dim).degree = positive_int{old_shard_degree / degree}; return output; }(); @@ -45,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("invalid") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 4_n; + positive_int degree = 4_p; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index b84cf38753..ee1255161c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -10,7 +10,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{1}}, + ff_dim_t{1_n}, }; SUBCASE("empty input shapes list passed") { @@ -23,12 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - nonnegative_int dim0_size = 12_n; - nonnegative_int dim2_size = 20_n; + positive_int dim0_size = 12_p; + positive_int dim2_size = 20_p; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n, + 14_p, dim2_size, }}, DataType::FLOAT, @@ -45,26 +45,26 @@ TEST_SUITE(FF_TEST_SUITE) { } TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16_n, + 16_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_p, dim2_size}}, DataType::FLOAT, }; SUBCASE("input shapes do not shared the same num_dims") { TensorShape mismatched_num_dims = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 20_n, + 20_p, dim2_size, - 1_n, + 1_p, }}, DataType::FLOAT, }; @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("concat axis is out of bounds") { attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{3}}, + ff_dim_t{3_n}, }; std::vector input_shapes = { @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, input_shapes); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n + 16_n + 18_n, + 14_p + 16_p + 18_p, dim2_size, }}, DataType::FLOAT, @@ -115,100 +115,100 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(ConcatAttrs, std::vector)") { ConcatAttrs attrs = ConcatAttrs{ - ff_dim_t{nonnegative_int{1}}, + ff_dim_t{1_n}, }; - nonnegative_int dim0_size = 12_n; - nonnegative_int dim2_size = 20_n; + positive_int dim0_size = 12_p; + positive_int dim2_size = 20_p; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14_n, + 14_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16_n, + 16_p, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_p, dim2_size}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{ - dim0_size, 14_n + 16_n + 18_n, dim2_size}}, + TensorDims{FFOrdered{ + dim0_size, 14_p + 16_p + 18_p, dim2_size}}, DataType::FLOAT, }; auto lift_input1 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - 
input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_input2 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_input3 = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto lift_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("sum reduction parallelism") { SUBCASE("matching") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; std::vector inputs = { - lift_input1(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input1(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + lift_output(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{2_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input1(SumDegree{2_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{4_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{4_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), }; std::optional result = @@ -221,27 +221,27 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy reduction parallelism") { SUBCASE("matching") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; std::vector inputs = { - lift_input1(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), - lift_input3(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input1(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + lift_output(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), - 
lift_input3(SumDegree{1_n}, DiscardCopyDegree{4_n}, 1_n, 1_n, 1_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{2_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{2_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{4_p}, 1_p, 1_p, 1_p), }; std::optional result = @@ -254,15 +254,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in axis dim") { SUBCASE("matching") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; std::vector inputs = { lift_input1( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), lift_input2( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), lift_input3( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p), }; std::optional result = @@ -274,9 +274,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), - lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 2_n, 1_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 2_p, 1_p), }; std::optional result = @@ -289,31 +289,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in non-axis shard dims") { SUBCASE("matching") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; std::vector inputs = { lift_input1( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), lift_input2( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), lift_input3( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = lift_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 2_n, 1_n, 4_n), - lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), - lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), + lift_input1(SumDegree{1_p}, DiscardCopyDegree{1_p}, 2_p, 1_p, 4_p), + lift_input2(SumDegree{1_p}, DiscardCopyDegree{1_p}, 4_p, 1_p, 2_p), + lift_input3(SumDegree{1_p}, DiscardCopyDegree{1_p}, 4_p, 1_p, 2_p), }; std::optional result = @@ -325,21 +325,21 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("parallelism degrees are not mutually exclusive") { - SumDegree sum_degree = SumDegree{3_n}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_n}; - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + SumDegree sum_degree = SumDegree{3_p}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_p}; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; std::vector inputs = { - lift_input1(sum_degree, discard_copy_degree, degree0, 1_n, degree2), - lift_input2(sum_degree, discard_copy_degree, degree0, 1_n, degree2), - 
lift_input3(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input1(sum_degree, discard_copy_degree, degree0, 1_p, degree2), + lift_input2(sum_degree, discard_copy_degree, degree0, 1_p, degree2), + lift_input3(sum_degree, discard_copy_degree, degree0, 1_p, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, discard_copy_degree, degree0, 1_n, degree2); + lift_output(sum_degree, discard_copy_degree, degree0, 1_p, degree2); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index f5006d4352..67b6bbadb8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -7,14 +7,14 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_conv2d_incoming_tensor_roles(Conv2DAttrs") { auto make_attrs = [](bool use_bias) { - return Conv2DAttrs{/*out_channels=*/4_n, - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + return Conv2DAttrs{/*out_channels=*/4_p, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, - /*groups=*/1_n, + /*groups=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/use_bias}; }; @@ -48,14 +48,14 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Conv2D shape inference") { - nonnegative_int out_channels = 4_n; - nonnegative_int kernel_h = 3_n; - nonnegative_int kernel_w = 2_n; - nonnegative_int stride_h = 2_n; - nonnegative_int stride_w = 2_n; + positive_int out_channels = 4_p; + positive_int kernel_h = 3_p; + positive_int kernel_w = 2_p; + positive_int stride_h = 2_p; + positive_int stride_w = 2_p; nonnegative_int padding_h = 1_n; nonnegative_int padding_w = 1_n; - nonnegative_int groups = 1_n; + positive_int groups = 1_p; std::optional activation = std::nullopt; bool use_bias = true; @@ -72,13 +72,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*use_bias=*/true, }; - nonnegative_int num_samples = 7_n; - nonnegative_int input_channels = 4_n; - nonnegative_int input_height = 11_n; - nonnegative_int input_width = 15_n; + positive_int num_samples = 7_p; + positive_int input_channels = 4_p; + positive_int input_height = 11_p; + positive_int input_width = 15_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, input_channels, input_height, @@ -87,11 +87,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - nonnegative_int output_height = 6_n; - nonnegative_int output_width = 8_n; + positive_int output_height = 6_p; + positive_int output_width = 8_p; TensorShape output = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, out_channels, output_height, @@ -101,7 +101,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape kernel = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ out_channels, input_channels, kernel_h, @@ -111,7 +111,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape bias = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ out_channels, }}, DataType::FLOAT, @@ -137,149 +137,149 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_n, - nonnegative_int o_c, - nonnegative_int o_h, - nonnegative_int o_w) { + positive_int o_n, + positive_int o_c, + positive_int o_h, + positive_int o_w) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto 
make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_n, - nonnegative_int o_c, - nonnegative_int o_h, - nonnegative_int o_w) { + positive_int o_n, + positive_int o_c, + positive_int o_h, + positive_int o_w) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_kernel = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannels, - nonnegative_int o_inchannels, - nonnegative_int o_kernel_h, - nonnegative_int o_kernel_w) { + positive_int o_outchannels, + positive_int o_inchannels, + positive_int o_kernel_h, + positive_int o_kernel_w) { return lift_to_parallel_with_degrees( kernel, o_sum, o_eq, - FFOrdered{ + FFOrdered{ o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; auto make_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannels) { + positive_int o_outchannels) { return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannels}); + bias, o_sum, o_eq, FFOrdered{o_outchannels}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n); + make_bias(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p); CHECK(result == correct); } } SUBCASE("input channel parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, 
DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree); + make_bias(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree); CHECK(result == correct); } } SUBCASE("propagating sum degree") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; ParallelTensorShape par_input = make_input( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); ParallelTensorShape correct = make_output( - SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); ParallelTensorShape correct = make_kernel( - SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); + SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc index e1a03a7613..194a93387b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc @@ -15,10 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -36,10 +36,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -48,42 +48,42 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("partition parallelism (allowed)") { - nonnegative_int degree0 = 2_n; 
- nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } SUBCASE("sum parallelism (not allowed)") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -93,10 +93,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (not allowed)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index d6a92036f0..4ef34c666e 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("EWAdd shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_n; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_p; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); @@ -53,9 +53,9 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("EWAdd parallel shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -66,7 +66,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape unpar_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -83,68 +83,68 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_rhs, o_sum, o_eq, FFOrdered{o_1, 
o_2, o_3}); + unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p, 1_p); CHECK(result == correct); } SUBCASE("reduction parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_lhs(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_rhs(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_output(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); CHECK(result == correct); } SUBCASE("invalid discard copy parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); @@ -154,12 +154,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid mismatched parallelism degrees") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n); + make_lhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree, 1_p); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, degree); + make_rhs(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, 1_p, degree); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index bac6efba3f..355feb4c5f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -7,16 +7,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ReLU shape inference") { - nonnegative_int d1 = 16_n; - nonnegative_int d2 = 32_n; - nonnegative_int d3 = 24_n; + positive_int d1 = 16_p; + positive_int d2 = 32_p; + positive_int d3 = 24_p; ElementUnaryAttrs attrs = ElementUnaryAttrs{OperatorType::RELU, std::nullopt}; 
TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -33,18 +33,18 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_1, - nonnegative_int o_2, - nonnegative_int o_3) { + positive_int o_1, + positive_int o_2, + positive_int o_3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("partition i.e., sharding parallelism") { - nonnegative_int degree1 = 4_n; - nonnegative_int degree2 = 8_n; + positive_int degree1 = 4_p; + positive_int degree2 = 8_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree1, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree1, 1_p, degree2); tl::expected result = get_output_shape(attrs, par_input); @@ -54,11 +54,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum degree > 1") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; tl::expected result = get_output_shape( attrs, - make_input(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); + make_input(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", @@ -66,11 +66,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy degree > 1") { - nonnegative_int degree = 2_n; + positive_int degree = 2_p; tl::expected result = get_output_shape( attrs, - make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n)); + make_input(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p, 1_p)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 8fe50a4217..7d43b45dd0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -8,8 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Sum embedding shape inference") { - nonnegative_int out_channels = 128_n; - nonnegative_int num_entries = 1024_n; + positive_int out_channels = 128_p; + positive_int num_entries = 1024_p; EmbeddingAttrs attrs = EmbeddingAttrs{ /*num_entries=*/num_entries, /*out_channels=*/out_channels, @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*data_type=*/DataType::FLOAT, }; - nonnegative_int batch_size = 48_n; - nonnegative_int features_dim = 56_n; + positive_int batch_size = 48_p; + positive_int features_dim = 56_p; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, features_dim, }}, @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, out_channels, }, @@ -40,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ num_entries, out_channels, }, @@ -66,44 +66,44 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_features) { + positive_int o_batch, + positive_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_outchannels) { + positive_int o_batch, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( output, 
o_sum, o_eq, - FFOrdered{o_batch, o_outchannels}); + FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_entries, - nonnegative_int o_outchannels) { + positive_int o_entries, + positive_int o_outchannels) { return lift_to_parallel_with_degrees( weights, o_sum, o_eq, - FFOrdered{o_entries, o_outchannels}); + FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, degree, 1_p); CHECK(result == correct); } @@ -111,21 +111,21 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, par_input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_weights(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); CHECK(result == correct); } } SUBCASE("input features parallelism") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n); + make_output(SumDegree{degree}, DiscardCopyDegree{1_p}, 1_p, 1_p); CHECK(result == correct); } @@ -133,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_weights(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); CHECK(result == correct); } } @@ -145,15 +145,15 @@ TEST_SUITE(FF_TEST_SUITE) { // dimension. 
For now we choose to represent parallelism in the channel // dimension, but partitioning in the entry dimension is also potentially // useful as it produces sum parallelism in the output - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ParallelTensorShape input = - make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{degree}, 1_p, 1_p); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_output(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); CHECK(result == correct); } @@ -161,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); + make_weights(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index ebd869b3e5..c4fe8a5250 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -9,11 +9,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(FlatAttrs, TensorShape)") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n, - 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p, + 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -26,8 +26,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n * 4_n * 2_n * 3_n, + TensorDims{FFOrdered{ + 2_p * 4_p * 2_p * 3_p, }}, DataType::FLOAT, }; @@ -37,16 +37,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten trailing dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{4}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{4_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n, - 2_n * 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p, + 2_p * 3_p, }}, DataType::FLOAT, }; @@ -56,16 +56,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten leading dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{0}}, - /*end_dim=*/ff_dim_t{nonnegative_int{2}}, + /*start_dim=*/ff_dim_t{0_n}, + /*end_dim=*/ff_dim_t{2_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n * 4_n, - 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p * 4_p, + 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -75,16 +75,16 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten middle dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}, + /*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2_n, - 4_n * 2_n, - 3_n, + TensorDims{FFOrdered{ + 2_p, + 4_p * 2_p, + 3_p, }}, DataType::FLOAT, }; @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim == end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{2}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{2_n}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -106,8 +106,8 @@ 
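The flat.cc expectations above all follow one rule: flattening collapses the half-open dimension range [start_dim, end_dim) into a single dimension whose size is the product of the collapsed extents, and an empty range leaves the shape untouched. A minimal standalone sketch of that rule, with a hypothetical flatten_dims helper (not part of the library), assuming exactly the semantics the tests assert:

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical mirror of the FlatAttrs semantics asserted above: dims in the
// half-open range [start_dim, end_dim) are multiplied into one dimension;
// dims outside the range pass through; an empty or inverted range is a no-op.
static std::vector<std::size_t> flatten_dims(std::vector<std::size_t> const &dims,
                                             std::size_t start_dim,
                                             std::size_t end_dim) {
  if (start_dim >= end_dim) {
    return dims;
  }
  std::vector<std::size_t> result(dims.begin(), dims.begin() + start_dim);
  std::size_t flattened = 1;
  for (std::size_t i = start_dim; i < end_dim; i++) {
    flattened *= dims.at(i);
  }
  result.push_back(flattened);
  result.insert(result.end(), dims.begin() + end_dim, dims.end());
  return result;
}

int main() {
  std::vector<std::size_t> input = {2, 4, 2, 3};
  // "flatten middle dims": [1, 3) collapses 4 and 2 into 8
  assert((flatten_dims(input, 1, 3) == std::vector<std::size_t>{2, 8, 3}));
  // "flatten trailing dims": [2, 4) collapses 2 and 3 into 6
  assert((flatten_dims(input, 2, 4) == std::vector<std::size_t>{2, 4, 6}));
  // "flatten no dims": an empty range leaves the shape unchanged
  assert(flatten_dims(input, 2, 2) == input);
}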
TEST_SUITE(FF_TEST_SUITE) { SUBCASE("flatten no dims (start_dim < end_dim)") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{2}}, - /*end_dim=*/ff_dim_t{nonnegative_int{1}}, + /*start_dim=*/ff_dim_t{2_n}, + /*end_dim=*/ff_dim_t{1_n}, }; TensorShape result = get_output_shape(attrs, input_shape); @@ -119,23 +119,23 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE( "get_output_parallel_dim_degrees(FlatAttrs, ParallelTensorDimDegrees)") { - FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}}; + FlatAttrs attrs = FlatAttrs{/*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}}; SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{2_n, 1_n, 1_n, 3_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{2_p, 1_p, 1_p, 3_p}, }; tl::expected result = get_output_parallel_dim_degrees(attrs, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{2_n, 1_n, 3_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{2_p, 1_p, 3_p}, }; CHECK(result == correct); @@ -143,9 +143,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not allow shard parallelism in flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 2_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 2_p, 1_p}, }; std::optional result = @@ -157,18 +157,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows sum parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{1_n, 1_n, 1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{1_p, 1_p, 1_p}, }; CHECK(result == correct); @@ -176,18 +176,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows discard copy parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, - FFOrdered{1_n, 1_n, 1_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, + FFOrdered{1_p, 1_p, 1_p, 1_p}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, - FFOrdered{1_n, 1_n, 1_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, + FFOrdered{1_p, 1_p, 1_p}, }; CHECK(result == correct); @@ -203,22 +203,22 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4_n, 2_n}, - ShardParallelDim{8_n, 1_n}, - ShardParallelDim{6_n, 1_n}, - ShardParallelDim{9_n, 3_n}, + ShardParallelDim{4_p, 2_p}, + ShardParallelDim{8_p, 1_p}, + ShardParallelDim{6_p, 1_p}, + ShardParallelDim{9_p, 3_p}, }, ReplicaParallelDimSet{ - SumDegree{7_n}, - DiscardCopyDegree{5_n}, + SumDegree{7_p}, + DiscardCopyDegree{5_p}, }, }, DataType::FLOAT, }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1_n}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3_n}}, + /*start_dim=*/ff_dim_t{nonnegative_int{1_p}}, + 
/*end_dim=*/ff_dim_t{nonnegative_int{3_p}}, }; tl::expected result = @@ -227,13 +227,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4_n, 2_n}, - ShardParallelDim{8_n * 6_n, 1_n}, - ShardParallelDim{9_n, 3_n}, + ShardParallelDim{4_p, 2_p}, + ShardParallelDim{8_p * 6_p, 1_p}, + ShardParallelDim{9_p, 3_p}, }, ReplicaParallelDimSet{ - SumDegree{7_n}, - DiscardCopyDegree{5_n}, + SumDegree{7_p}, + DiscardCopyDegree{5_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index b9aa3c0677..ba311ffb1a 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_layer_norm_incoming_tensor_roles(LayerNormAttrs)") { auto make_attrs = [](bool elementwise_affine) { return LayerNormAttrs{ - /*axes=*/{ff_dim_t{nonnegative_int{0}}, ff_dim_t{nonnegative_int{2}}}, + /*axes=*/{ff_dim_t{0_n}, ff_dim_t{2_n}}, elementwise_affine, /*eps=*/1.0, }; @@ -46,7 +46,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("shape inference (LayerNorm)") { LayerNormAttrs attrs_affine_true = LayerNormAttrs{ - /*axes=*/{ff_dim_t{nonnegative_int{1}}, ff_dim_t{nonnegative_int{3}}}, + /*axes=*/{ff_dim_t{1_n}, ff_dim_t{3_n}}, /*elementwise_affine=*/true, /*eps=*/0.1, }; @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, - 18_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, + 18_p, }}, DataType::FLOAT, }; @@ -70,9 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 16_p, }}, DataType::FLOAT, }; @@ -125,58 +125,58 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2, - nonnegative_int o3) { + positive_int o0, + positive_int o1, + positive_int o2, + positive_int o3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2, - nonnegative_int o3) { + positive_int o0, + positive_int o1, + positive_int o2, + positive_int o3) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_gamma_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o2) { + positive_int o0, + positive_int o2) { return lift_to_parallel_with_degrees( - gamma, o_sum, o_eq, FFOrdered{o0, o2}); + gamma, o_sum, o_eq, FFOrdered{o0, o2}); }; auto make_beta_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o2) { + positive_int o0, + positive_int o2) { return lift_to_parallel_with_degrees( - beta, o_sum, o_eq, FFOrdered{o0, o2}); + beta, o_sum, o_eq, FFOrdered{o0, o2}); }; SUBCASE("parallel shape inference (LayerNorm)") { SUBCASE("partition parallelism (not in axes)") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 3_n; + positive_int degree0 = 2_p; + positive_int degree2 = 3_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2, 1_n); + SumDegree{1_p}, 
DiscardCopyDegree{1_p}, degree0, 1_p, degree2, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { tl::expected result = get_output_shape(attrs_affine_true, par_input); tl::expected correct = - make_output(SumDegree{1_n}, - DiscardCopyDegree{1_n}, + make_output(SumDegree{1_p}, + DiscardCopyDegree{1_p}, degree0, - 1_n, + 1_p, degree2, - 1_n); + 1_p); CHECK(result == correct); } @@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_gamma_weights( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, degree2); CHECK(result == correct); } @@ -208,7 +208,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_beta_weights( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, degree2); CHECK(result == correct); } @@ -224,11 +224,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (in axes)") { - nonnegative_int degree1 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree1 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, degree2, 1_n); + SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree1, degree2, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -257,10 +257,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -289,10 +289,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p, 1_p); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index eaa99ef099..1ca936738b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_linear_incoming_tensor_roles(LinearAttrs)") { auto make_attrs = [](bool use_bias) { return LinearAttrs{ - /*out_channels=*/16_n, + /*out_channels=*/16_p, /*use_bias=*/use_bias, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Linear shape inference") { - nonnegative_int out_channels = 16_n; + positive_int out_channels = 16_p; LinearAttrs attrs = LinearAttrs{ /*out_channels=*/out_channels, /*use_bias=*/true, @@ -56,13 +56,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - nonnegative_int batch_size = 12_n; - nonnegative_int extra_dim = 16_n; - nonnegative_int in_channels = 8_n; + positive_int batch_size = 12_p; + positive_int extra_dim = 16_p; + positive_int in_channels = 8_p; 
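The input/output/projection/bias shapes built below encode three rules that the rest of this test exercises: the output keeps every input dimension except the innermost channel dimension, which becomes out_channels; the projection weight is {in_channels, out_channels}; and the bias is {out_channels}. A minimal standalone sketch of those rules (infer_linear_shapes is a hypothetical helper, not library code):

#include <cassert>
#include <vector>

// Hypothetical mirror of the Linear shape-inference rules asserted below.
struct LinearShapes {
  std::vector<int> output;
  std::vector<int> projection;
  std::vector<int> bias;
};

static LinearShapes infer_linear_shapes(std::vector<int> input, int out_channels) {
  int in_channels = input.back();
  std::vector<int> output = input;
  output.back() = out_channels; // only the innermost (channel) dim changes
  return LinearShapes{output, {in_channels, out_channels}, {out_channels}};
}

int main() {
  // batch_size = 12, extra_dim = 16, in_channels = 8, out_channels = 16
  LinearShapes s = infer_linear_shapes({12, 16, 8}, 16);
  assert((s.output == std::vector<int>{12, 16, 16}));
  assert((s.projection == std::vector<int>{8, 16}));
  assert((s.bias == std::vector<int>{16}));
}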
TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, in_channels, @@ -73,7 +73,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, out_channels, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape projection = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ in_channels, out_channels, }, @@ -94,7 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape bias = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ out_channels, }, }, @@ -127,66 +127,66 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_extra_dim, - nonnegative_int o_channel) { + positive_int o_batch, + positive_int o_extra_dim, + positive_int o_channel) { return lift_to_parallel_with_degrees( input, o_sum, o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_batch, - nonnegative_int o_extra_dim, - nonnegative_int o_channel) { + positive_int o_batch, + positive_int o_extra_dim, + positive_int o_channel) { return lift_to_parallel_with_degrees( output, o_sum, o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_inchannel, - nonnegative_int o_outchannel) { + positive_int o_inchannel, + positive_int o_outchannel) { return lift_to_parallel_with_degrees( projection, o_sum, o_eq, - FFOrdered{o_inchannel, o_outchannel}); + FFOrdered{o_inchannel, o_outchannel}); }; auto make_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o_outchannel) { + positive_int o_outchannel) { return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannel}); + bias, o_sum, o_eq, FFOrdered{o_outchannel}); }; SUBCASE("data parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int extra_dim_degree = 8_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int extra_dim_degree = 8_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, + DiscardCopyDegree{1_p}, degree, extra_dim_degree, - 1_n); + 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, + DiscardCopyDegree{1_p}, degree, extra_dim_degree, - 1_n); + 1_p); CHECK(result == correct); } @@ -195,10 +195,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_projection_shape(attrs, par_input); tl::expected correct = make_projection( - SumDegree{1_n}, + SumDegree{1_p}, DiscardCopyDegree{input_sum_degree * degree * extra_dim_degree}, - 1_n, - 1_n); + 1_p, + 1_p); CHECK(result == correct); } @@ -208,19 +208,19 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = make_bias(SumDegree{input_sum_degree}, DiscardCopyDegree{degree * extra_dim_degree}, - 1_n); + 1_p); CHECK(result == correct); } } SUBCASE("reduction parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, + 1_p, degree); { @@ -228,10 +228,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_output_shape(attrs, 
par_input); tl::expected correct = make_output(SumDegree{input_sum_degree * degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, - 1_n); + DiscardCopyDegree{1_p}, + 1_p, + 1_p, + 1_p); CHECK(result == correct); } @@ -239,10 +239,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection(SumDegree{1_n}, + make_projection(SumDegree{1_p}, DiscardCopyDegree{input_sum_degree}, degree, - 1_n); + 1_p); CHECK(result == correct); } @@ -250,29 +250,29 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_n}, 1_n); + SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_p}, 1_p); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - nonnegative_int input_sum_degree = 2_n; - nonnegative_int degree = 4_n; + positive_int input_sum_degree = 2_p; + positive_int degree = 4_p; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, DiscardCopyDegree{degree}, - 1_n, - 1_n, - 1_n); + 1_p, + 1_p, + 1_p); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1_n}, - 1_n, - 1_n, + DiscardCopyDegree{1_p}, + 1_p, + 1_p, degree); CHECK(result == correct); } @@ -281,9 +281,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection(SumDegree{1_n}, + make_projection(SumDegree{1_p}, DiscardCopyDegree{input_sum_degree}, - 1_n, + 1_p, degree); CHECK(result == correct); } @@ -292,7 +292,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree}, DiscardCopyDegree{1_n}, degree); + SumDegree{input_sum_degree}, DiscardCopyDegree{1_p}, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 6c14a226a2..9a27aafa5b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -9,25 +9,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("make_adaptive_pool2d") { - nonnegative_int input_n = 10_n; - nonnegative_int input_c = 11_n; - nonnegative_int input_h = 15_n; - nonnegative_int input_w = 20_n; + positive_int input_n = 10_p; + positive_int input_c = 11_p; + positive_int input_h = 15_p; + positive_int input_w = 20_p; Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; TensorDims input_dims = TensorDims{ - FFOrdered{input_n, input_c, input_h, input_w}}; + FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { - nonnegative_int output_h = 5_n; - nonnegative_int output_w = 2_n; + positive_int output_h = 5_p; + positive_int output_w = 2_p; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/10_n, - /*stride_h=*/3_n, - /*stride_w=*/10_n, + /*kernel_h=*/3_p, + /*kernel_w=*/10_p, + /*stride_h=*/3_p, + /*stride_w=*/10_p, /*padding_h=*/0_n, /*padding_w=*/0_n, /*pool_type=*/op, @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(correct_attrs, input_shape); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ input_n, input_c, output_h, @@ -64,8 +64,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h not divisible 
by output_h") { - nonnegative_int output_h = 6_n; - nonnegative_int output_w = 2_n; + positive_int output_h = 6_p; + positive_int output_w = 2_p; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -76,8 +76,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_w not divisible by output_w") { - nonnegative_int output_h = 5_n; - nonnegative_int output_w = 3_n; + positive_int output_h = 5_p; + positive_int output_w = 3_p; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -88,14 +88,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h == output_h and input_w == output_w") { - nonnegative_int output_h = input_h; - nonnegative_int output_w = input_w; + positive_int output_h = input_h; + positive_int output_w = input_w; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/1_n, - /*kernel_w=*/1_n, - /*stride_h=*/1_n, - /*stride_w=*/1_n, + /*kernel_h=*/1_p, + /*kernel_w=*/1_p, + /*stride_h=*/1_p, + /*stride_w=*/1_p, /*padding_h=*/0_n, /*padding_w=*/0_n, /*pool_type=*/op, @@ -126,10 +126,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(Pool2DAttrs, TensorShape)") { Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, @@ -138,10 +138,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("fails on non-4d inputs") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, - 14_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, + 14_p, }}, DataType::FLOAT, }; @@ -155,14 +155,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("4d input") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{11_n, 13_n, 12_n, 6_n}}, + TensorDims{FFOrdered{11_p, 13_p, 12_p, 6_p}}, DataType::FLOAT, }; tl::expected result = get_output_shape(attrs, input); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{11_n, 13_n, 6_n, 4_n}}, + TensorDims{FFOrdered{11_p, 13_p, 6_p, 4_p}}, DataType::FLOAT, }; @@ -175,10 +175,10 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_attrs = [](PoolOp pool_type, std::optional const &activation) { return Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/pool_type, @@ -190,13 +190,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 4_n, - 1_n, - 1_n, - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 4_p, + 1_p, + 1_p, + 1_p, }, }; @@ -211,13 +211,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 4_n, - 2_n, - 5_n, - 6_n, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 4_p, + 2_p, + 5_p, + 6_p, }, }; @@ -232,13 +232,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1_n}, - DiscardCopyDegree{3_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{1_p}, + DiscardCopyDegree{3_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -256,13 +256,13 @@ TEST_SUITE(FF_TEST_SUITE) { 
make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -279,13 +279,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -302,13 +302,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/Activation::RELU); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, - FFOrdered{ - 1_n, - 1_n, - 1_n, - 1_n, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, + FFOrdered{ + 1_p, + 1_p, + 1_p, + 1_p, }, }; @@ -326,10 +326,10 @@ TEST_SUITE(FF_TEST_SUITE) { // just do a single test to make sure it works/exists Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3_n, - /*kernel_w=*/2_n, - /*stride_h=*/2_n, - /*stride_w=*/2_n, + /*kernel_h=*/3_p, + /*kernel_w=*/2_p, + /*stride_h=*/2_p, + /*stride_w=*/2_p, /*padding_h=*/1_n, /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 7_n}, - ShardParallelDim{16_n, 8_n}, - ShardParallelDim{12_n, 3_n}, - ShardParallelDim{6_n, 2_n}, + ShardParallelDim{14_p, 7_p}, + ShardParallelDim{16_p, 8_p}, + ShardParallelDim{12_p, 3_p}, + ShardParallelDim{6_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -359,14 +359,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 7_n}, - ShardParallelDim{16_n, 8_n}, - ShardParallelDim{6_n, 3_n}, - ShardParallelDim{4_n, 2_n}, + ShardParallelDim{14_p, 7_p}, + ShardParallelDim{16_p, 8_p}, + ShardParallelDim{6_p, 3_p}, + ShardParallelDim{4_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{2_n}, + SumDegree{1_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -377,14 +377,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 1_n}, - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{6_n, 1_n}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 1_p}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{6_p, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{2_n}, - DiscardCopyDegree{1_n}, + SumDegree{2_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index dc12eb12a8..a480c840a3 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + 
SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - nonnegative_int degree = 3_n; + positive_int degree = 3_p; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; @@ -34,7 +34,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; - output.dims.replica_dims.sum_degree.value /= degree; + positive_int old_sum_degree = output.dims.replica_dims.sum_degree.value; + output.dims.replica_dims.sum_degree.value = positive_int{old_sum_degree / degree}; return output; }(); @@ -42,7 +43,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - nonnegative_int degree = 4_n; + positive_int degree = 4_p; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index 36a265ce9f..3743cebc31 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { ff_dim_t dim = ff_dim_t{2_n}; - nonnegative_int degree = 4_n; + positive_int degree = 4_p; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -16,14 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12_n, 2_n}, - ShardParallelDim{14_n, 1_n}, - ShardParallelDim{16_n, 3_n}, - ShardParallelDim{18_n, 2_n}, + ShardParallelDim{12_p, 2_p}, + ShardParallelDim{14_p, 1_p}, + ShardParallelDim{16_p, 3_p}, + ShardParallelDim{18_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc index 770ae20d38..11ac7c02ab 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Replicate shape inference") { ReplicateAttrs attrs = ReplicateAttrs{ - /*replicate_degree=*/4_n, + /*replicate_degree=*/4_p, }; ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10_n, 2_n}, - ShardParallelDim{12_n, 1_n}, - ShardParallelDim{14_n, 2_n}, - ShardParallelDim{16_n, 2_n}, + ShardParallelDim{10_p, 2_p}, + ShardParallelDim{12_p, 1_p}, + ShardParallelDim{14_p, 2_p}, + ShardParallelDim{16_p, 2_p}, }, ReplicaParallelDimSet{ - SumDegree{3_n}, - DiscardCopyDegree{2_n}, + SumDegree{3_p}, + DiscardCopyDegree{2_p}, }, }, DataType::FLOAT, @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape correct_output = input; correct_output.dims.replica_dims.discard_copy_degree = - DiscardCopyDegree{8_n}; + DiscardCopyDegree{8_p}; CHECK(result == correct_output); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 8c80e348c0..29507565e8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -10,10 +10,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, TensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ 
-41,10 +41,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, ParallelTensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 14_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 14_p, + 16_p, }}, DataType::FLOAT, }; @@ -52,28 +52,28 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - nonnegative_int o0, - nonnegative_int o1, - nonnegative_int o2) { + positive_int o0, + positive_int o1, + positive_int o2) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); }; SUBCASE("partition parallelism in non-softmax-dim (valid)") { - nonnegative_int degree0 = 2_n; - nonnegative_int degree2 = 4_n; + positive_int degree0 = 2_p; + positive_int degree2 = 4_p; ParallelTensorShape par_input = make_input( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); SUBCASE("attrs.dim in bounds") { SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; @@ -81,7 +81,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); + SumDegree{1_p}, DiscardCopyDegree{1_p}, degree0, 1_p, degree2); CHECK(result == correct); } @@ -98,12 +98,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallism in softmax dim (invalid)") { - nonnegative_int degree1 = 2_n; + positive_int degree1 = 2_p; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, 1_n); + make_input(SumDegree{1_p}, DiscardCopyDegree{1_p}, 1_p, degree1, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -113,12 +113,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism (invalid)") { - SumDegree sum_degree = SumDegree{2_n}; + SumDegree sum_degree = SumDegree{2_p}; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); + make_input(sum_degree, DiscardCopyDegree{1_p}, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -128,12 +128,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (invalid)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_p}; SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); + make_input(SumDegree{1_p}, discard_copy_degree, 1_p, 1_p, 1_p); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index 1187bfcfbf..f3d629cad8 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { 
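The body below relies on nlohmann::json round-tripping of the attrs types. A standalone sketch of that serialize/deserialize pattern with a stand-in type (DemoAttrs is hypothetical; the real RepartitionAttrs/PCGOperatorAttrs serializers come from the dtgen-generated headers):

#include <cassert>
#include <nlohmann/json.hpp>

// Stand-in for a dtgen-style struct with json serialization support.
struct DemoAttrs {
  int repartition_dim;
  int repartition_degree;
  bool operator==(DemoAttrs const &o) const {
    return repartition_dim == o.repartition_dim &&
           repartition_degree == o.repartition_degree;
  }
};

NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(DemoAttrs, repartition_dim, repartition_degree)

int main() {
  DemoAttrs correct{/*repartition_dim=*/1, /*repartition_degree=*/4};
  nlohmann::json j = correct;       // serialize
  auto result = j.get<DemoAttrs>(); // deserialize
  assert(result == correct);        // round-trip preserves the value
}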
PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{1_n}, - /*repartition_degree=*/4_n, + /*repartition_degree=*/4_p, }}; nlohmann::json j = correct; auto result = j.get(); diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 7e072d82d9..044b50fae2 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -8,7 +8,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { TensorDims goal = - TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; SUBCASE("dims match") { bool result = tensor_dims_is_broadcastable_to(goal, goal); @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs num_dims promotion") { - TensorDims curr = TensorDims{FFOrdered{4_n, 3_n}}; + TensorDims curr = TensorDims{FFOrdered{4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("curr only needs dim expansion") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs both num_dims promotion and dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1_n, 3_n}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("curr needs invalid dim promotion") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 2_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("num_dims(goal) < num_dims(curr)") { TensorDims curr = - TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -67,13 +67,13 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { - TensorDims d1 = TensorDims{FFOrdered{1_n, 10_n, 4_n, 3_n}}; + TensorDims d1 = TensorDims{FFOrdered{1_p, 10_p, 4_p, 3_p}}; - TensorDims d2 = TensorDims{FFOrdered{10_n, 4_n, 1_n}}; + TensorDims d2 = TensorDims{FFOrdered{10_p, 4_p, 1_p}}; SUBCASE("has target in inputs") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("has no possible target") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -95,10 +95,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("has possible target, but not in inputs") { TensorDims d3 = - TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; TensorDims possible_target = - TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; + TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; REQUIRE(tensor_dims_is_broadcastable_to(d1, possible_target)); REQUIRE(tensor_dims_is_broadcastable_to(d2, possible_target)); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h 
b/lib/pcg/include/pcg/computation_graph_builder.h index b996026ce7..2be2a54cd8 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -84,15 +84,15 @@ struct ComputationGraphBuilder { // Add a 2D convolutional layer tensor_guid_t conv2d( tensor_guid_t const &input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation = std::nullopt, - nonnegative_int groups = 1_n, + positive_int groups = 1_p, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -106,8 +106,8 @@ struct ComputationGraphBuilder { // Add an embedding layer tensor_guid_t embedding( tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &initializer = std::nullopt, @@ -127,10 +127,10 @@ struct ComputationGraphBuilder { // Add a 2D pooling layer tensor_guid_t pool2d(tensor_guid_t const &input, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, PoolOp type = PoolOp::MAX, @@ -138,8 +138,8 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t adaptive_pool2d( tensor_guid_t const &input, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); @@ -164,7 +164,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t dense( tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -226,10 +226,10 @@ struct ComputationGraphBuilder { tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - nonnegative_int kdim = 0_n, - nonnegative_int vdim = 0_n, + positive_int embed_dim, + positive_int num_heads, + std::optional const &kdim = std::nullopt, + std::optional const &vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 11c5a81bba..863d9909c0 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -8,11 +8,11 @@ namespace FlexFlow { -nonnegative_int get_num_gpus(MachineSpecification const &ms); -nonnegative_int get_num_cpus(MachineSpecification const &ms); -nonnegative_int get_num_devices(MachineSpecification const &ms, +positive_int get_num_gpus(MachineSpecification const &ms); +positive_int get_num_cpus(MachineSpecification const &ms); +positive_int get_num_devices(MachineSpecification const &ms, DeviceType const &device_type); -nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, +positive_int 
get_num_devices_per_node(MachineSpecification const &ms, DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, diff --git a/lib/pcg/include/pcg/machine_specification.struct.toml b/lib/pcg/include/pcg/machine_specification.struct.toml index 7c624c7240..49e9bd9d78 100644 --- a/lib/pcg/include/pcg/machine_specification.struct.toml +++ b/lib/pcg/include/pcg/machine_specification.struct.toml @@ -10,20 +10,20 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "num_nodes" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_cpus_per_node" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "num_gpus_per_node" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" [[fields]] name = "inter_node_bandwidth" diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index b095fad088..ceb0146f15 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -17,7 +17,7 @@ TaskSpaceCoordinate get_task_space_maximum_coordinate(OperatorTaskSpace const &task); nonnegative_int num_dims(OperatorTaskSpace const &task); -nonnegative_int num_tasks(OperatorTaskSpace const &task); +positive_int num_tasks(OperatorTaskSpace const &task); OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); diff --git a/lib/pcg/include/pcg/operator_task_space.struct.toml b/lib/pcg/include/pcg/operator_task_space.struct.toml index 9cc4f6b93a..389e12e8f2 100644 --- a/lib/pcg/include/pcg/operator_task_space.struct.toml +++ b/lib/pcg/include/pcg/operator_task_space.struct.toml @@ -5,13 +5,13 @@ features = [ "ord", "hash", "json", - # "rapidcheck", + "rapidcheck", "fmt", ] includes = [ "", - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -21,4 +21,4 @@ src_includes = [ [[fields]] name = "degrees" -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index d4cace4a2a..aad2770101 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -32,15 +32,15 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t conv2d( parallel_tensor_guid_t const &input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation = std::nullopt, - nonnegative_int groups = 1_n, + positive_int groups = 1_p, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -49,7 +49,7 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t dense( parallel_tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = 
DataType::FLOAT, @@ -60,8 +60,8 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t embedding( parallel_tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -71,10 +71,10 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - std::optional kdim = std::nullopt, - std::optional vdim = std::nullopt, + positive_int embed_dim, + positive_int num_heads, + std::optional kdim = std::nullopt, + std::optional vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -119,20 +119,20 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t parallel_partition(parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_combine(parallel_tensor_guid_t const &x, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_replicate(parallel_tensor_guid_t const &x, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_reduce(parallel_tensor_guid_t const &x, - nonnegative_int degree, + positive_int degree, std::optional const &name = std::nullopt); ParallelTensorShape get_shape(parallel_tensor_guid_t const &) const; diff --git a/lib/pcg/include/pcg/stride_t.struct.toml b/lib/pcg/include/pcg/stride_t.struct.toml index 8d950c5f39..3f07ec6b01 100644 --- a/lib/pcg/include/pcg/stride_t.struct.toml +++ b/lib/pcg/include/pcg/stride_t.struct.toml @@ -10,9 +10,9 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] [[fields]] name = "unwrapped" -type = "::FlexFlow::nonnegative_int" +type = "::FlexFlow::positive_int" diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 267f05499c..0a24acc6aa 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -378,15 +378,15 @@ tensor_guid_t tensor_guid_t ComputationGraphBuilder::conv2d( tensor_guid_t const &x, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const &activation, - nonnegative_int groups, + positive_int groups, bool use_bias, std::optional const &maybe_kernel_initializer, std::optional const &maybe_bias_initializer, @@ -440,8 +440,8 @@ tensor_guid_t ComputationGraphBuilder::dropout( tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype, std::optional const &initializer, @@ -491,10 +491,10 @@ tensor_guid_t ComputationGraphBuilder::gather( } tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t const &x, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - 
nonnegative_int strideW, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, PoolOp type, @@ -525,8 +525,8 @@ tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t ComputationGraphBuilder::adaptive_pool2d( tensor_guid_t const &uncasted_input, - nonnegative_int output_h, - nonnegative_int output_w, + positive_int output_h, + positive_int output_w, PoolOp type, std::optional const &activation, std::optional const &maybe_name) { @@ -591,10 +591,10 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - nonnegative_int kdim, - nonnegative_int vdim, + positive_int embed_dim, + positive_int num_heads, + std::optional const &kdim, + std::optional const &vdim, float dropout, bool bias, bool add_bias_kv, @@ -619,8 +619,8 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, - /*kdim=*/kdim, - /*vdim=*/vdim, + /*kdim=*/kdim.value_or(embed_dim), + /*vdim=*/vdim.value_or(embed_dim), /*dropout=*/dropout, /*bias=*/bias, /*add_bias_kv=*/add_bias_kv, @@ -667,7 +667,7 @@ TensorDims ComputationGraphBuilder::get_broadcast_target_dims( tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation, bool use_bias, DataType data_type, diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index 0fefeddd27..08afa415af 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -2,17 +2,18 @@ #include "pcg/device_id.h" #include "utils/containers/transform.h" #include "utils/exception.h" + namespace FlexFlow { -nonnegative_int get_num_gpus(MachineSpecification const &ms) { +positive_int get_num_gpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_gpus_per_node; } -nonnegative_int get_num_cpus(MachineSpecification const &ms) { +positive_int get_num_cpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_cpus_per_node; } -nonnegative_int get_num_devices(MachineSpecification const &ms, +positive_int get_num_devices(MachineSpecification const &ms, DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: @@ -24,7 +25,7 @@ nonnegative_int get_num_devices(MachineSpecification const &ms, } } -nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, +positive_int get_num_devices_per_node(MachineSpecification const &ms, DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 88110f914a..3afa73ca62 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -91,7 +91,7 @@ std::optional get_machine_space_coordinate( std::vector const &dimension_indices) { std::vector mv_strides = get_strides(machine_view); - std::vector sizes = + std::vector sizes = transform(dimension_indices, [&](nonnegative_int i) { return task.degrees.at(i.unwrap_nonnegative()) * mv_strides.at(i.unwrap_nonnegative()).unwrapped; @@ -100,13 +100,13 @@ std::optional get_machine_space_coordinate( transform(dimension_indices, [&](nonnegative_int i) { return coord.raw_coord.at(i.unwrap_nonnegative()); }); - std::vector strides = + 
std::vector strides = transform(dimension_indices, [&](nonnegative_int i) { return mv_strides.at(i.unwrap_nonnegative()).unwrapped; }); - std::vector coeffs = scanl( - sizes, nonnegative_int{1}, std::multiplies()); + std::vector coeffs = scanl( + sizes, 1_p, std::multiplies()); nonnegative_int index = start_idx; for (auto [coeff, coord_point, stride] : diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 57af6eedc7..36ad43f3d3 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -23,8 +23,8 @@ std::unordered_set get_task_space_coordinates(OperatorTaskSpace const &task) { std::vector> coordinate_ranges = - transform(task.degrees, [&](nonnegative_int num_points) { - return nonnegative_range(num_points); + transform(task.degrees, [&](positive_int num_points) { + return nonnegative_range(num_points.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_coordinates = @@ -45,7 +45,7 @@ nonnegative_int num_dims(OperatorTaskSpace const &task) { return num_elements(task.degrees); } -nonnegative_int num_tasks(OperatorTaskSpace const &task) { +positive_int num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); } @@ -54,7 +54,7 @@ OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor); - std::vector degrees; + std::vector degrees; extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); degrees.push_back(get_sum_degree(shape)); degrees.push_back(get_discard_copy_degree(shape)); diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index 940024c9b6..e3caffe260 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -9,7 +9,7 @@ std::unordered_set ParallelTensorShape const &goal) { std::unordered_set result; - nonnegative_int sum_degree = get_sum_degree(goal); + positive_int sum_degree = get_sum_degree(goal); if (sum_degree != 1) { throw mk_runtime_error( fmt::format("generate_weight_transform currently only supports " @@ -17,7 +17,7 @@ std::unordered_set sum_degree)); } - nonnegative_int discard_copy_degree = get_discard_copy_degree(goal); + positive_int discard_copy_degree = get_discard_copy_degree(goal); if (discard_copy_degree != 1) { result.insert(ParallelOpAttrs{ReplicateAttrs{discard_copy_degree}}); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 4e72b2fe0f..f7f3cfdcfd 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -128,15 +128,15 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::cast( parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t const &raw_input, - nonnegative_int outChannels, - nonnegative_int kernelH, - nonnegative_int kernelW, - nonnegative_int strideH, - nonnegative_int strideW, + positive_int outChannels, + positive_int kernelH, + positive_int kernelW, + positive_int strideH, + positive_int strideW, nonnegative_int paddingH, nonnegative_int paddingW, std::optional const 
&activation, - nonnegative_int groups, + positive_int groups, bool use_bias, std::optional const &maybe_kernel_initializer, std::optional const &maybe_bias_initializer, @@ -176,7 +176,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t const &input, - nonnegative_int outDim, + positive_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -209,8 +209,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( parallel_tensor_guid_t const &input, - nonnegative_int num_entries, - nonnegative_int outDim, + positive_int num_entries, + positive_int outDim, AggregateOp aggr, DataType dtype, std::optional const &maybe_kernel_initializer, @@ -238,10 +238,10 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - nonnegative_int embed_dim, - nonnegative_int num_heads, - std::optional maybe_kdim, - std::optional maybe_vdim, + positive_int embed_dim, + positive_int num_heads, + std::optional maybe_kdim, + std::optional maybe_vdim, float dropout, bool bias, bool add_bias_kv, @@ -251,8 +251,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( std::optional maybe_output_bias_initializer, std::optional const &maybe_name) { - nonnegative_int kdim = maybe_kdim.value_or(embed_dim); - nonnegative_int vdim = maybe_vdim.value_or(embed_dim); + positive_int kdim = maybe_kdim.value_or(embed_dim); + positive_int vdim = maybe_vdim.value_or(embed_dim); MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -409,7 +409,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::elu( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { RepartitionAttrs attrs = RepartitionAttrs{ @@ -428,7 +428,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t const &input, ff_dim_t dim, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { CombineAttrs attrs = CombineAttrs{ @@ -446,7 +446,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t const &input, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { ReplicateAttrs attrs = ReplicateAttrs{degree}; @@ -461,7 +461,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_reduce( parallel_tensor_guid_t const &input, - nonnegative_int degree, + positive_int degree, std::optional const &maybe_name) { ReductionAttrs attrs = ReductionAttrs{degree}; diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc index 341801d0b0..8451545e32 100644 --- a/lib/pcg/test/src/pcg/computation_graph.cc +++ b/lib/pcg/test/src/pcg/computation_graph.cc @@ -14,9 +14,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 
10_p, + 12_p, }}, DataType::FLOAT, }; @@ -41,9 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -67,16 +67,16 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14_n, + /*outDim=*/14_p, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -104,9 +104,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -132,9 +132,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -157,9 +157,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = make_empty_computation_graph(); TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10_n, - 12_n, + TensorDims{FFOrdered{ + 10_p, + 12_p, }}, DataType::FLOAT, }; @@ -172,7 +172,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/pcg/test/src/pcg/computation_graph_builder.cc b/lib/pcg/test/src/pcg/computation_graph_builder.cc index 98a4e2a241..f7430b3403 100644 --- a/lib/pcg/test/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/computation_graph_builder.cc @@ -8,20 +8,20 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ComputationGraphBuilder") { ComputationGraphBuilder b; - nonnegative_int batch_size = 2_n; + positive_int batch_size = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, + TensorDims{FFOrdered{batch_size, 3_p, 10_p, 10_p}}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); tensor_guid_t output = b.conv2d(input, - /*outChannels=*/5_n, - /*kernelH=*/3_n, - /*kernelW=*/3_n, - /*strideH=*/1_n, - /*strideW=*/1_n, + /*outChannels=*/5_p, + /*kernelH=*/3_p, + /*kernelW=*/3_p, + /*strideH=*/1_p, + /*strideW=*/1_p, /*paddingH=*/0_n, /*paddingW=*/0_n); // ComputationGraph cg = b.computation_graph; diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc index 59c606adb1..7af3f648d9 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc @@ -10,15 +10,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12_n, - 16_n, + TensorDims{FFOrdered{ + 12_p, + 16_p, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - tensor_guid_t mm_output = b.dense(input, 8_n); + tensor_guid_t mm_output = b.dense(input, 8_p); tensor_guid_t relu_output = b.relu(mm_output); return b.computation_graph; diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc index 9d5dceca18..ec6a4ab006 100644 --- 
a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, + FFOrdered{ + 12_p, + 16_p, }, }, DataType::FLOAT, @@ -21,8 +21,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t t_partition = - b.parallel_partition(input, ff_dim_t{0_n}, 2_n); - parallel_tensor_guid_t mm_output = b.dense(input, 8_n); + b.parallel_partition(input, ff_dim_t{0_n}, 2_p); + parallel_tensor_guid_t mm_output = b.dense(input, 8_p); parallel_tensor_guid_t relu_output = b.relu(mm_output); return b.pcg; diff --git a/lib/pcg/test/src/pcg/machine_specification.cc b/lib/pcg/test/src/pcg/machine_specification.cc index 6d339350a0..4064f36679 100644 --- a/lib/pcg/test/src/pcg/machine_specification.cc +++ b/lib/pcg/test/src/pcg/machine_specification.cc @@ -8,9 +8,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineSpecification") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/4_n, - /*num_cpus_per_node=*/16_n, - /*num_gpus_per_node=*/8_n, + /*num_nodes=*/4_p, + /*num_cpus_per_node=*/16_p, + /*num_gpus_per_node=*/8_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index e286f08bf2..ecc196a118 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -13,9 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}}}; SUBCASE("num_dims") { @@ -43,16 +43,16 @@ TEST_SUITE(FF_TEST_SUITE) { * Where the (x,) are the `TaskSpaceCoordinate`s, and the underlying grid * is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -112,18 +112,18 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -179,18 +179,18 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -253,20 +253,20 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{1_n}, + MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/8_n, - /*num_gpus_per_node=*/8_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/8_p, + /*num_gpus_per_node=*/8_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; @@ -319,17 +319,17 @@ TEST_SUITE(FF_TEST_SUITE) { * select */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { @@ -364,19 +364,19 @@ TEST_SUITE(FF_TEST_SUITE) { */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3_n, - /*num_cpus_per_node=*/5_n, - /*num_gpus_per_node=*/5_n, + MachineSpecification{/*num_nodes=*/3_p, + /*num_cpus_per_node=*/5_p, + /*num_gpus_per_node=*/5_p, 
/*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set correct = { diff --git a/lib/pcg/test/src/pcg/operator_task_space.cc b/lib/pcg/test/src/pcg/operator_task_space.cc index fa06af3635..4b01ed02fb 100644 --- a/lib/pcg/test/src/pcg/operator_task_space.cc +++ b/lib/pcg/test/src/pcg/operator_task_space.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; std::unordered_set correct = {{ TaskSpaceCoordinate{{0_n, 0_n}}, @@ -32,7 +32,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{1_p, 2_p, 2_p}}; std::unordered_set correct = {{ TaskSpaceCoordinate{{0_n, 0_n, 0_n}}, @@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_task_space_maximum_coordinate") { SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p, 2_p}}; TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); @@ -56,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n, 4_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p, 2_p, 4_p}}; TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n, 3_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index d68e20bd92..f223558868 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -28,9 +28,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, + FFOrdered{ + 12_p, + 16_p, }, }, DataType::FLOAT, @@ -64,7 +64,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{10_n, 12_n}, + FFOrdered{10_p, 12_p}, }, DataType::FLOAT, }; @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::string my_op_name = "my op"; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -205,9 +205,9 @@ TEST_SUITE(FF_TEST_SUITE) { "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + 
FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -248,7 +248,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/14_n, + /*out_channels=*/14_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -261,7 +261,7 @@ TEST_SUITE(FF_TEST_SUITE) { RepartitionAttrs partition_input_attrs = RepartitionAttrs{ /*repartition_dim=*/ff_dim_t{0_n}, - /*repartition_degree=*/2_n, + /*repartition_degree=*/2_p, }; ParallelLayerAddedResult partition_input_added = add_parallel_layer( @@ -281,7 +281,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_only(projection_weight_added.outputs); ReplicateAttrs replicate_projection_attrs = ReplicateAttrs{ - /*replicate_degree=*/2_n, + /*replicate_degree=*/2_p, }; ParallelLayerAddedResult replicate_projection_added = add_parallel_layer(pcg, @@ -309,9 +309,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("pcg_add_input_layer") { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 10_n, + FFOrdered{ + 12_p, + 10_p, }, }, DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index b82cb009a9..1682ac6254 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -26,14 +26,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::add") { ParallelComputationGraphBuilder b; - ShardParallelDim d1 = ShardParallelDim{10_n, 2_n}; - ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; + ShardParallelDim d1 = ShardParallelDim{10_p, 2_p}; + ShardParallelDim d2 = ShardParallelDim{15_p, 3_p}; TensorShape lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 15_n, + FFOrdered{ + 10_p, + 15_p, }, }, DataType::FLOAT, @@ -42,12 +42,12 @@ TEST_SUITE(FF_TEST_SUITE) { // ParallelTensorShape lhs_shape = ParallelTensorShape{ // ParallelTensorDims{ // FFOrdered{ - // ShardParallelDim{10_n, 2_n}, - // ShardParallelDim{15_n, 3_n}, + // ShardParallelDim{10_p, 2_p}, + // ShardParallelDim{15_p, 3_p}, // }, // ReplicaParallelDimSet{ - // SumDegree{2_n}, - // DiscardCopyDegree{1_n}, + // SumDegree{2_p}, + // DiscardCopyDegree{1_p}, // }, // }, // DataType::FLOAT, @@ -88,10 +88,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ - 4_n, - 10_n, - 15_n, + FFOrdered{ + 4_p, + 10_p, + 15_p, }, }, DataType::FLOAT, @@ -99,10 +99,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape b_shape = TensorShape{ TensorDims{ - FFOrdered{ - 4_n, - 15_n, - 10_n, + FFOrdered{ + 4_p, + 15_p, + 10_p, }, }, DataType::FLOAT, @@ -141,9 +141,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, @@ -176,24 +176,24 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::conv2d") { ParallelComputationGraphBuilder b; - nonnegative_int batch_size = 2_n; + positive_int batch_size = 2_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, + TensorDims{FFOrdered{batch_size, 3_p, 10_p, 10_p}}, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t par_input = - b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + 
b.parallel_partition(input, ff_dim_t{0_n}, 2_p); ParallelTensorShape par_input_shape = b.get_shape(par_input); - nonnegative_int outChannels = 6_n; - nonnegative_int kernelH = 5_n; - nonnegative_int kernelW = 4_n; - nonnegative_int strideH = 3_n; - nonnegative_int strideW = 2_n; + positive_int outChannels = 6_p; + positive_int kernelH = 5_p; + positive_int kernelW = 4_p; + positive_int strideH = 3_p; + positive_int strideW = 2_p; nonnegative_int paddingH = 1_n; nonnegative_int paddingW = 0_n; parallel_tensor_guid_t output = b.conv2d(par_input, @@ -252,7 +252,7 @@ TEST_SUITE(FF_TEST_SUITE) { strideW, paddingH, paddingW, - /*groups=*/1_n, + /*groups=*/1_p, /*activation=*/std::nullopt, /*use_bias=*/true, }; @@ -298,14 +298,14 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 16_n, + FFOrdered{ + 10_p, + 16_p, }, }, DataType::FLOAT, }; - nonnegative_int outDim = 14_n; + positive_int outDim = 14_p; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.dense(input, @@ -336,9 +336,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 10_n, + FFOrdered{ + 12_p, + 10_p, }, }, DataType::INT32, @@ -346,8 +346,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.embedding(input, - /*num_entries=*/32_n, - /*outDim=*/8_n, + /*num_entries=*/32_p, + /*outDim=*/8_p, AggregateOp::SUM, DataType::FLOAT); parallel_layer_guid_t layer = get_source_layer(output); @@ -373,10 +373,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape query_shape = TensorShape{ TensorDims{ - FFOrdered{ - 12_n, - 16_n, - 10_n, + FFOrdered{ + 12_p, + 16_p, + 10_p, }, }, DataType::FLOAT, @@ -385,8 +385,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape key_shape = query_shape; TensorShape value_shape = query_shape; - nonnegative_int embed_dim = 8_n; - nonnegative_int num_heads = 6_n; + positive_int embed_dim = 8_p; + positive_int num_heads = 6_p; parallel_tensor_guid_t query = b.create_input_tensor(query_shape); parallel_tensor_guid_t key = b.create_input_tensor(key_shape); @@ -417,9 +417,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 32_n, + FFOrdered{ + 18_p, + 32_p, }, }, DataType::FLOAT, @@ -447,14 +447,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_partition") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; - ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; + ShardParallelDim batch_dim = ShardParallelDim{18_p, 2_p}; + ShardParallelDim feature_dim = ShardParallelDim{10_p, 1_p}; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, @@ -462,7 +462,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2_n); + b.parallel_partition(input, ff_dim_t{0_n}, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -485,18 +485,18 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - input = b.parallel_partition(input, ff_dim_t{0_n}, 2_n); + input = 
b.parallel_partition(input, ff_dim_t{0_n}, 2_p); parallel_tensor_guid_t output = - b.parallel_combine(input, ff_dim_t{0_n}, 2_n); + b.parallel_combine(input, ff_dim_t{0_n}, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -519,16 +519,16 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_replicate(input, 2_n); + parallel_tensor_guid_t output = b.parallel_replicate(input, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -551,21 +551,21 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 18_n, - 10_n, + FFOrdered{ + 18_p, + 10_p, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - input = b.parallel_partition(input, ff_dim_t{1_n}, 2_n); + input = b.parallel_partition(input, ff_dim_t{1_n}, 2_p); input = b.dense(input, - /*out_dim=*/12_n, + /*out_dim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false); - parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); + parallel_tensor_guid_t output = b.parallel_reduce(input, 2_p); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc index 5a8f5fcd19..d037d64672 100644 --- a/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc +++ b/lib/pcg/test/src/pcg/pcg_from_computation_graph.cc @@ -14,16 +14,16 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ - 10_n, - 12_n, + FFOrdered{ + 10_p, + 12_p, }, }, DataType::FLOAT, }; LinearAttrs linear_attrs = LinearAttrs{ - /*out_channels=*/8_n, + /*out_channels=*/8_p, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc index 71c4d1b1d0..afd6ad6b33 100644 --- a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc +++ b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc @@ -8,9 +8,9 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - utility functions") { StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}}, DeviceType::GPU}; @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_strides") { std::vector result = get_strides(simv); - std::vector correct = {stride_t{2_n}, stride_t{2_n}}; + std::vector correct = {stride_t{2_p}, stride_t{2_p}}; CHECK(result == correct); } @@ -45,9 +45,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineSpaceCoordinate start = MachineSpaceCoordinate{1_n, 2_n, DeviceType::GPU}; std::vector dimensions = { - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{3_n}, + MachineViewDimension{stride_t{3_p}, MachineSpecificationDimension::INTRA_NODE}}; MachineView mv = MachineView{start, dimensions}; @@ -94,15 +94,15 @@ TEST_SUITE(FF_TEST_SUITE) { * | (0,) | | (1,) | | (2,) | | * 
+-------+-------+-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_p}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2_n}, + {MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1_n, - /*num_cpus_per_node=*/6_n, - /*num_gpus_per_node=*/6_n, + MachineSpecification{/*num_nodes=*/1_p, + /*num_cpus_per_node=*/6_p, + /*num_gpus_per_node=*/6_p, /*inter_node_bandwidth=*/0.0, /*intra_node_bandwidth=*/0.0}; @@ -162,17 +162,17 @@ TEST_SUITE(FF_TEST_SUITE) { * +-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_p, 2_p}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{1_n}, + {MachineViewDimension{stride_t{1_p}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2_n}, + MachineViewDimension{stride_t{2_p}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2_n, - /*num_cpus_per_node=*/4_n, - /*num_gpus_per_node=*/4_n, + MachineSpecification{/*num_nodes=*/2_p, + /*num_cpus_per_node=*/4_p, + /*num_gpus_per_node=*/4_p, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 3312b292a0..1994d54f38 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -21,6 +21,7 @@ includes = [ "op-attrs/datatype.dtg.h", "", "utils/nonnegative_int/nonnegative_int.h", + "utils/positive_int/positive_int.h", ] src_includes = [ @@ -33,6 +34,9 @@ src_includes = [ [[values]] type = "::FlexFlow::nonnegative_int" +[[values]] +type = "::FlexFlow::positive_int" + [[values]] type = "bool" @@ -43,7 +47,7 @@ type = "float" type = "std::optional" [[values]] -type = "std::vector<::FlexFlow::nonnegative_int>" +type = "std::vector<::FlexFlow::positive_int>" [[values]] type = "std::vector<::FlexFlow::ff_dim_t>" diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 4f11b343f8..1568b73162 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,12 +33,12 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get( + /*embed_dim=*/acc.get( OperatorAttributeKey::EMBED_DIM), /*num_heads=*/ - acc.get(OperatorAttributeKey::NUM_HEADS), - /*kdim=*/acc.get(OperatorAttributeKey::KDIM), - /*vdim=*/acc.get(OperatorAttributeKey::VDIM), + acc.get(OperatorAttributeKey::NUM_HEADS), + /*kdim=*/acc.get(OperatorAttributeKey::KDIM), + /*vdim=*/acc.get(OperatorAttributeKey::VDIM), /*dropout=*/acc.get(OperatorAttributeKey::DROPOUT), /*bias=*/acc.get(OperatorAttributeKey::BIAS), 
/*add_bias_kv=*/acc.get(OperatorAttributeKey::ADD_BIAS_KV), @@ -46,10 +46,10 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( }}; case OperatorType::POOL2D: return PCGOperatorAttrs{Pool2DAttrs{ - /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), - /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), - /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), - /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), + /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), + /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), + /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), + /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), /*padding_h=*/ acc.get(OperatorAttributeKey::PADDING_H), /*padding_w=*/ @@ -66,7 +66,7 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( case OperatorType::DROPOUT: case OperatorType::LINEAR: return PCGOperatorAttrs{LinearAttrs{ - /*out_channels=*/acc.get( + /*out_channels=*/acc.get( OperatorAttributeKey::OUT_CHANNELS), /*use_bias=*/acc.get(OperatorAttributeKey::USE_BIAS), /*data_type=*/acc.get(OperatorAttributeKey::DATA_TYPE), diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc index ad78695fbb..05fd1a3fc9 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -62,15 +62,15 @@ TEST_SUITE(FF_TEST_SUITE) { Substitution sub = b.get_substitution(); - nonnegative_int in_channels = 24_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, in_channels, }, @@ -84,11 +84,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -98,7 +98,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -138,11 +138,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/Activation::RELU, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -150,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*bias_initializer=*/std::nullopt, /*name=*/std::nullopt); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 75bbbcae9e..7419c62965 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc +++ 
b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -111,15 +111,15 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - nonnegative_int in_channels = 24_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; TensorShape input_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, in_channels, }, @@ -133,11 +133,11 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(input_shape); t = b.parallel_partition(t, ff_dim_t{0_n}, batch_degree); t = b.dense(t, - /*outDim=*/16_n, + /*outDim=*/16_p, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12_n, + /*outDim=*/12_p, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -147,7 +147,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8_n, + /*outDim=*/8_p, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -189,7 +189,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_input_map = result.second.input_mapping; LinearAttrs correct_result_fused_mm_relu_attrs = LinearAttrs{ - /*out_channels=*/12_n, + /*out_channels=*/12_p, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 9b8e526c08..2bf72d3224 100644 --- a/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -18,21 +18,21 @@ TEST_SUITE(FF_TEST_SUITE) { UnorderedSetLabelledOpenDataflowGraph>(); - nonnegative_int in_channels = 24_n; - nonnegative_int out_channels = 16_n; - nonnegative_int batch_size = 4_n; - nonnegative_int batch_degree = 2_n; + positive_int in_channels = 24_p; + positive_int out_channels = 16_p; + positive_int batch_size = 4_p; + positive_int batch_degree = 2_p; DataflowGraphInput i0 = g.add_input({}); ParallelTensorShape i0_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ ShardParallelDim{batch_size, batch_degree}, - ShardParallelDim{in_channels, 1_n}, + ShardParallelDim{in_channels, 1_p}, }, ReplicaParallelDimSet{ - SumDegree{1_n}, - DiscardCopyDegree{1_n}, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, }, }, DataType::FLOAT, diff --git a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc index 24f9e9bd56..5dcfda0ca7 100644 --- a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attribute(LinearAttrs, OperatorAttributeKey)") { - nonnegative_int out_channels = 16_n; + positive_int out_channels = 16_p; bool use_bias = true; std::optional activation = Activation::GELU; std::optional regularizer = RegularizerAttrs{ diff --git a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index 4dbf0885dd..f4d430077f 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ 
b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -16,13 +16,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("simple case") { ParallelComputationGraphBuilder builder; - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; + positive_int batch_size = 16_p; + positive_int batch_degree = 2_p; + positive_int num_channels = 24_p; TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, num_channels, }, @@ -36,7 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { a_tensor = builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - nonnegative_int outDim = 16_n; + positive_int outDim = 16_p; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = @@ -159,13 +159,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("pcg is a chain") { ParallelComputationGraphBuilder builder; - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; + positive_int batch_size = 16_p; + positive_int batch_degree = 2_p; + positive_int num_channels = 24_p; TensorShape a_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, num_channels, }, @@ -179,7 +179,7 @@ TEST_SUITE(FF_TEST_SUITE) { a_tensor = builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - nonnegative_int outDim = 16_n; + positive_int outDim = 16_p; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = diff --git a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc index 804fa99bef..c86cb7e51f 100644 --- a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc +++ b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc @@ -6,9 +6,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_substitution_set") { MachineSpecification machine_spec = MachineSpecification{ - /*num_nodes=*/2_n, - /*num_cpus_per_node=*/8_n, - /*num_gpus_per_node=*/4_n, + /*num_nodes=*/2_p, + /*num_cpus_per_node=*/8_p, + /*num_gpus_per_node=*/4_p, /*inter_node_bandwidth=*/0.0, /*intra_node_bandwidth=*/0.0, }; diff --git a/lib/task-spec/CMakeLists.txt b/lib/task-spec/CMakeLists.txt index 8deb20a593..8ccd8312cb 100644 --- a/lib/task-spec/CMakeLists.txt +++ b/lib/task-spec/CMakeLists.txt @@ -14,3 +14,5 @@ ff_add_library( pcg spdlog ) + +add_subdirectory(test) diff --git a/lib/task-spec/src/task-spec/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc index 01960803ce..488517a02e 100644 --- a/lib/task-spec/src/task-spec/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -85,10 +85,10 @@ static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); - nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); - nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); - nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); + positive_int qProjSize = acc.get_argument(QPROJSIZE); + positive_int kProjSize = acc.get_argument(KPROJSIZE); + positive_int vProjSize = acc.get_argument(VPROJSIZE); + positive_int oProjSize = acc.get_argument(OPROJSIZE); PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = @@ -108,29 +108,29 @@ static DeviceSpecificDeviceStates 
          key_parallel_tensor_shape,
          value_parallel_tensor_shape));
 
-  nonnegative_int kvSeqLength = get_kvSeqLength(parsed);
-  nonnegative_int qSize = get_qSize(parsed);
-  nonnegative_int kSize = get_kSize(parsed);
-  nonnegative_int vSize = get_vSize(parsed);
+  positive_int kvSeqLength = get_kvSeqLength(parsed);
+  positive_int qSize = get_qSize(parsed);
+  positive_int kSize = get_kSize(parsed);
+  positive_int vSize = get_vSize(parsed);
 
-  nonnegative_int qoSeqLength = get_qoSeqLength(parsed);
-  nonnegative_int num_samples = get_num_samples(parsed);
-  nonnegative_int num_heads = attrs.num_heads;
+  positive_int qoSeqLength = get_qoSeqLength(parsed);
+  positive_int num_samples = get_num_samples(parsed);
+  positive_int num_heads = attrs.num_heads;
 
   MHAPerDeviceState per_device_state =
       init_kernel(handle,
                   allocator,
-                  num_samples.unwrap_nonnegative(),
-                  num_heads.unwrap_nonnegative(),
-                  qSize.unwrap_nonnegative(),
-                  kSize.unwrap_nonnegative(),
-                  vSize.unwrap_nonnegative(),
-                  qProjSize.unwrap_nonnegative(),
-                  kProjSize.unwrap_nonnegative(),
-                  vProjSize.unwrap_nonnegative(),
-                  oProjSize.unwrap_nonnegative(),
-                  qoSeqLength.unwrap_nonnegative(),
-                  kvSeqLength.unwrap_nonnegative(),
+                  num_samples.int_from_positive_int(),
+                  num_heads.int_from_positive_int(),
+                  qSize.int_from_positive_int(),
+                  kSize.int_from_positive_int(),
+                  vSize.int_from_positive_int(),
+                  qProjSize.int_from_positive_int(),
+                  kProjSize.int_from_positive_int(),
+                  vProjSize.int_from_positive_int(),
+                  oProjSize.int_from_positive_int(),
+                  qoSeqLength.int_from_positive_int(),
+                  kvSeqLength.int_from_positive_int(),
                   attrs.add_bias_kv);
   return DeviceSpecificDeviceStates{
       DeviceSpecific<MHAPerDeviceState>::create(per_device_state)};
@@ -185,7 +185,7 @@ static std::optional<float>
   assert(key_grad.shape == key.shape);
   assert(query_grad.shape == query.shape);
 
-  assert(weight_grad.shape.get_volume() == weight.shape.get_volume());
+  assert(weight_grad.shape.num_elements() == weight.shape.num_elements());
 
   return profile(backward_kernel,
                  profiling,
@@ -217,10 +217,10 @@ OpTaskSignature get_attention_init_signature() {
   init.add_arg_slot<ParallelTensorShape>(QUERY_PARALLEL_TENSOR_SHAPE);
   init.add_arg_slot<ParallelTensorShape>(KEY_PARALLEL_TENSOR_SHAPE);
   init.add_arg_slot<ParallelTensorShape>(VALUE_PARALLEL_TENSOR_SHAPE);
-  init.add_arg_slot<nonnegative_int>(QPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(KPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(VPROJSIZE);
-  init.add_arg_slot<nonnegative_int>(OPROJSIZE);
+  init.add_arg_slot<positive_int>(QPROJSIZE);
+  init.add_arg_slot<positive_int>(KPROJSIZE);
+  init.add_arg_slot<positive_int>(VPROJSIZE);
+  init.add_arg_slot<positive_int>(OPROJSIZE);
   init.add_arg_slot<MultiHeadAttentionAttrs>(ATTRS);
 
   init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
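The attention hunks above and the remaining task-spec ops below all lean on the same small positive_int surface: construction through the _p literal, multiplication that stays closed over positive values, and explicit narrowing via int_from_positive_int() at kernel boundaries. For orientation, a condensed sketch of that surface as it appears from the call sites in this patch; only the declarations are taken from the patch (the full header is added at the end of this diff), and the bodies here are illustrative guesses, not the real implementation:

#include <stdexcept>

// Illustrative sketch only; the real type lives in
// lib/utils/include/utils/positive_int/positive_int.h.
struct positive_int_sketch {
  positive_int_sketch() = delete;
  explicit positive_int_sketch(int value) : value_(value) {
    // The invariant the type exists to enforce: strictly positive.
    if (value <= 0) {
      throw std::invalid_argument("positive_int requires a value > 0");
    }
  }

  // Explicit narrowing used at kernel boundaries, e.g.
  // num_samples.int_from_positive_int().
  int int_from_positive_int() const {
    return this->value_;
  }

  // positive * positive stays positive, so batch *= dim_size is total.
  positive_int_sketch &operator*=(positive_int_sketch other) {
    this->value_ *= other.value_;
    return *this;
  }

  friend bool operator==(positive_int_sketch lhs, int rhs) {
    return lhs.value_ == rhs;
  }
  friend bool operator!=(positive_int_sketch lhs, int rhs) {
    return !(lhs == rhs);
  }

private:
  int value_;
};

// Literal used throughout the patch as 1_p, 2_p, ...
positive_int_sketch operator""_p(unsigned long long value) {
  return positive_int_sketch{static_cast<int>(value)};
}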
diff --git a/lib/task-spec/src/task-spec/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc
index 371c80d7e2..1ee9da82d3 100644
--- a/lib/task-spec/src/task-spec/ops/batch_matmul.cc
+++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc
@@ -66,21 +66,21 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
+  positive_int m = b_input.shape.at(legion_dim_t{0_n});
+  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
+  positive_int n = a_input.shape.at(legion_dim_t{1_n});
+  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
+  positive_int k = a_input.shape.at(legion_dim_t{0_n});
+  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+
+  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
+  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+
+  positive_int batch = 1_p;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
+    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
+    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
+    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
 
@@ -97,10 +97,10 @@ static std::optional<float>
     forward_task_impl(TaskArgumentAccessor const &acc) {
                  output.get_float_ptr(),
                  a_input.get_float_ptr(),
                  b_input.get_float_ptr(),
-                 m.unwrap_nonnegative(),
-                 n.unwrap_nonnegative(),
-                 k.unwrap_nonnegative(),
-                 batch.unwrap_nonnegative(),
+                 m.int_from_positive_int(),
+                 n.int_from_positive_int(),
+                 k.int_from_positive_int(),
+                 batch.int_from_positive_int(),
                  get_raw_seq_len(attrs.a_seq_length_dim),
                  get_raw_seq_len(attrs.b_seq_length_dim),
                  iter_config.seq_length);
@@ -116,31 +116,31 @@ static std::optional<float>
   auto output = acc.get_tensor(OUTPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
-  assert(output.shape == output_grad.shape);
+  ASSERT(output.shape == output_grad.shape);
 
   auto a_input = acc.get_tensor(A_INPUT);
   auto a_input_grad = acc.get_tensor_grad(A_INPUT);
-  assert(a_input.shape == a_input_grad.shape);
+  ASSERT(a_input.shape == a_input_grad.shape);
 
   auto b_input = acc.get_tensor(B_INPUT);
   auto b_input_grad = acc.get_tensor_grad(B_INPUT);
-  assert(b_input.shape == b_input_grad.shape);
+  ASSERT(b_input.shape == b_input_grad.shape);
 
   // check dims
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
+  positive_int m = b_input.shape.at(legion_dim_t{0_n});
+  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
+  positive_int n = a_input.shape.at(legion_dim_t{1_n});
+  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
+  positive_int k = a_input.shape.at(legion_dim_t{0_n});
+  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
+  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+
+  positive_int batch = 1_p;
+  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
+    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
+    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
+    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
     batch *= dim_size;
   }
 
@@
-154,10 +154,10 @@ static std::optional a_input_grad.get_float_ptr(), b_input.get_float_ptr(), b_input_grad.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative()); + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } TaskImplFunction get_batch_matmul_fwd_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc index 2aa308dada..67c5a7d8a2 100644 --- a/lib/task-spec/src/task-spec/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -75,10 +75,10 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = output.shape.at(legion_dim_t{0_n}); + positive_int output_h = output.shape.at(legion_dim_t{1_n}); + positive_int output_c = output.shape.at(legion_dim_t{2_n}); + positive_int output_n = output.shape.at(legion_dim_t{3_n}); float *runningMean; @@ -86,10 +86,10 @@ static DeviceSpecificDeviceStates init_kernel(handle, allocator, runningMean, - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative(), + output_n.int_from_positive_int(), + output_c.int_from_positive_int(), + output_h.int_from_positive_int(), + output_w.int_from_positive_int(), attrs.relu); return DeviceSpecificDeviceStates{ @@ -141,7 +141,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.get_volume().unwrap_nonnegative()); + output.shape.num_elements().int_from_positive_int()); } TaskImplFunction get_batch_norm_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc index 47b889c6ce..ea4f7f79df 100644 --- a/lib/task-spec/src/task-spec/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -63,13 +63,13 @@ static DeviceSpecificDeviceStates Conv2DPerDeviceState per_device_state = init_kernel(/*handle=*/handle, /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), - /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), - /*groups=*/attrs.groups.unwrap_nonnegative(), + /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), + /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), + /*groups=*/attrs.groups.int_from_positive_int(), /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), - /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.int_from_positive_int(), + /*stride_w=*/attrs.stride_w.int_from_positive_int(), /*input=*/input, /*output=*/output, /*filter_ptr=*/filter.get_float_ptr(), diff --git a/lib/task-spec/src/task-spec/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc index a0bfaddc0f..5f7173a991 100644 --- a/lib/task-spec/src/task-spec/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -68,10 +68,10 @@ static DeviceSpecificDeviceStates legion_dim_t legion_dim = legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - assert(input.shape.get_dim() == index.shape.get_dim()); - 
assert(output.shape.get_dim() == index.shape.get_dim()); + assert(input.shape.num_dims() == index.shape.num_dims()); + assert(output.shape.num_dims() == index.shape.num_dims()); - for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { + for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); if (i != legion_dim.value) { assert(input.shape.at(legion_dim_t{i}) == diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index c2f16d7eda..7e6c5062e2 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -118,25 +118,25 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto handle = acc.get_argument(HANDLE); - nonnegative_int M = 1_n; + positive_int M = 1_p; for (int i = 0; i < attrs.axes.size(); i++) { legion_dim_t legion_dim = legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims()); M *= input.shape.at(legion_dim); } - nonnegative_int num_replicas = 1_n; + positive_int num_replicas = 1_p; for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { num_replicas *= input.shape.at(legion_dim_t{i}); } - nonnegative_int effective_num_elements = M; - nonnegative_int effective_batch_size = input.shape.get_volume() / M; + positive_int effective_num_elements = M; + positive_int effective_batch_size = positive_int{input.shape.num_elements() / M}; LayerNormPerDeviceState per_device_state = init_kernel(handle, allocator, attrs.elementwise_affine, - effective_batch_size.unwrap_nonnegative(), - effective_num_elements.unwrap_nonnegative(), + effective_batch_size.int_from_positive_int(), + effective_num_elements.int_from_positive_int(), attrs.eps); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 8d4a81c5c4..3bf8080877 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -65,8 +65,8 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = output.shape.at(ff_dim_t{1_n}); float *one_ptr; @@ -79,8 +79,8 @@ static DeviceSpecificDeviceStates input.data_type, weight.data_type, output.data_type, - batch_size.unwrap_nonnegative(), - attrs.out_channels.unwrap_nonnegative()); + batch_size.int_from_positive_int(), + attrs.out_channels.int_from_positive_int()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } @@ -95,9 +95,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; + positive_int in_dim = input.shape.at(ff_dim_t{0_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; float const *bias_ptr = NULL; if (attrs.use_bias) { @@ -113,9 +113,9 @@ static std::optional 
forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr(), weight.get_float_ptr(), bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); + in_dim.int_from_positive_int(), + out_dim.int_from_positive_int(), + batch_size.int_from_positive_int()); } static std::optional @@ -139,9 +139,9 @@ static std::optional bias_grad_ptr = bias_grad.get_float_ptr(); } - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; + positive_int in_dim = input.shape.at(ff_dim_t{0_n}); + positive_int out_dim = output.shape.at(ff_dim_t{0_n}); + positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; return profile(backward_kernel, profiling, @@ -154,9 +154,9 @@ static std::optional weight.get_float_ptr(), weight_grad.get_float_ptr(), bias_grad_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); + in_dim.int_from_positive_int(), + out_dim.int_from_positive_int(), + batch_size.int_from_positive_int()); } TaskImplFunction get_linear_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc index d7064ca04d..bceced61d3 100644 --- a/lib/task-spec/src/task-spec/ops/pool_2d.cc +++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc @@ -42,32 +42,32 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int input_w = input.shape.at(ff_dim_t{0_n}); - nonnegative_int input_h = input.shape.at(ff_dim_t{1_n}); - nonnegative_int input_c = input.shape.at(ff_dim_t{2_n}); - nonnegative_int input_n = input.shape.at(ff_dim_t{3_n}); - nonnegative_int output_w = output.shape.at(ff_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(ff_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(ff_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(ff_dim_t{3_n}); + positive_int input_w = input.shape.at(ff_dim_t{0_n}); + positive_int input_h = input.shape.at(ff_dim_t{1_n}); + positive_int input_c = input.shape.at(ff_dim_t{2_n}); + positive_int input_n = input.shape.at(ff_dim_t{3_n}); + positive_int output_w = output.shape.at(ff_dim_t{0_n}); + positive_int output_h = output.shape.at(ff_dim_t{1_n}); + positive_int output_c = output.shape.at(ff_dim_t{2_n}); + positive_int output_n = output.shape.at(ff_dim_t{3_n}); Pool2DPerDeviceState per_device_state = init_kernel(handle, attrs.activation, - input_w.unwrap_nonnegative(), - input_h.unwrap_nonnegative(), - input_c.unwrap_nonnegative(), - input_n.unwrap_nonnegative(), - output_w.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), + input_w.int_from_positive_int(), + input_h.int_from_positive_int(), + input_c.int_from_positive_int(), + input_n.int_from_positive_int(), + output_w.int_from_positive_int(), + output_h.int_from_positive_int(), + output_c.int_from_positive_int(), + output_n.int_from_positive_int(), attrs.padding_h.unwrap_nonnegative(), attrs.padding_w.unwrap_nonnegative(), - attrs.kernel_h.unwrap_nonnegative(), - attrs.kernel_w.unwrap_nonnegative(), - attrs.stride_h.unwrap_nonnegative(), - attrs.stride_w.unwrap_nonnegative(), + attrs.kernel_h.int_from_positive_int(), + attrs.kernel_w.int_from_positive_int(), + attrs.stride_h.int_from_positive_int(), + attrs.stride_w.int_from_positive_int(), attrs.pool_type); 
return DeviceSpecificDeviceStates{ diff --git a/lib/task-spec/src/task-spec/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc index ccc1285aaa..3efac36c3f 100644 --- a/lib/task-spec/src/task-spec/ops/reduce.cc +++ b/lib/task-spec/src/task-spec/ops/reduce.cc @@ -40,7 +40,7 @@ static DeviceSpecificDeviceStates OperatorType op_type = attrs.op_type; nonnegative_int reduction_size = - input.shape.get_volume() / output.shape.get_volume(); + input.shape.num_elements() / output.shape.num_elements(); ReducePerDeviceState per_device_state = init_kernel(handle, op_type, diff --git a/lib/task-spec/src/task-spec/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc index 96e2c6c506..48f4c0e98d 100644 --- a/lib/task-spec/src/task-spec/ops/reduction.cc +++ b/lib/task-spec/src/task-spec/ops/reduction.cc @@ -49,14 +49,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - nonnegative_int num_replicas = attrs.reduction_degree; + positive_int num_replicas = attrs.reduction_degree; return profile(forward_kernel, profiling_settings, "[Reduction] forward_time = {:.2lf}ms\n", input, output, - num_replicas.unwrap_nonnegative()); + num_replicas.int_from_positive_int()); } static std::optional diff --git a/lib/task-spec/src/task-spec/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc index 0ed5d98708..e91414bc16 100644 --- a/lib/task-spec/src/task-spec/ops/replicate.cc +++ b/lib/task-spec/src/task-spec/ops/replicate.cc @@ -68,7 +68,7 @@ static std::optional "[replicate] backward_time = {:.2lf}ms\n", output_grad, input_grad, - attrs.replicate_degree.unwrap_nonnegative()); + attrs.replicate_degree.int_from_positive_int()); } TaskImplFunction get_replicate_fwd_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc index d7b27fd884..81239d1a67 100644 --- a/lib/task-spec/src/task-spec/ops/softmax.cc +++ b/lib/task-spec/src/task-spec/ops/softmax.cc @@ -58,18 +58,18 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = output.shape.at(legion_dim_t{0_n}); + positive_int output_h = output.shape.at(legion_dim_t{1_n}); + positive_int output_c = output.shape.at(legion_dim_t{2_n}); + positive_int output_n = output.shape.at(legion_dim_t{3_n}); SoftmaxPerDeviceState per_device_state = init_kernel(handle, attrs.dim.value.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative()); + output_n.int_from_positive_int(), + output_c.int_from_positive_int(), + output_h.int_from_positive_int(), + output_w.int_from_positive_int()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -108,7 +108,7 @@ static std::optional "[SoftMax] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), - output_grad.shape.get_volume().unwrap_nonnegative()); + output_grad.shape.num_elements().int_from_positive_int()); } TaskImplFunction get_softmax_init_task_impl() { diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc index 
a14f6a587d..aa3184c999 100644 --- a/lib/task-spec/src/task-spec/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -44,11 +44,11 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {task_id_t::SPLIT_BWD_TASK_ID, binding}; } -static std::pair +static std::pair calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { - nonnegative_int num_blocks = 1_n; - nonnegative_int block_size = 1_n; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements())) { + positive_int num_blocks = 1_p; + positive_int block_size = 1_p; + for (nonnegative_int d : nonnegative_range(array_shape.num_elements().nonnegative_int_from_positive_int())) { if (d <= axis.value) { block_size *= array_shape.at(legion_dim_t{d}); } else { @@ -69,7 +69,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { for (int i = 0; i < attrs.splits.size(); i++) { auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); + out_block_sizes[i] = out_block_size.int_from_positive_int(); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -78,8 +78,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { &output_float_ptr, input.get_float_ptr(), out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), + in_block_size.int_from_positive_int(), + num_blocks.int_from_positive_int(), attrs.splits.size()); } @@ -98,7 +98,7 @@ static std::optional for (int i = 0; i < attrs.splits.size(); i++) { coord_t out_num_blocks; auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); + out_block_sizes[i] = out_block_size.int_from_positive_int(); } float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, @@ -107,8 +107,8 @@ static std::optional input_grad.get_float_ptr(), &output_grad_ptr, out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), + in_block_size.int_from_positive_int(), + num_blocks.int_from_positive_int(), attrs.splits.size()); } diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc index 11f1fffa41..ea2d855bf6 100644 --- a/lib/task-spec/src/task-spec/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -74,8 +74,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - nonnegative_int length = input.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input.shape.get_volume() / length; + positive_int length = input.shape.at(legion_dim_t{0_n}); + positive_int batch_size = positive_int{input.shape.num_elements() / length}; auto indices = acc.get_tensor(INDICES); return profile(forward_kernel, @@ -85,9 +85,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { input.get_float_ptr(), output.get_float_ptr(), indices.get_int32_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative(), + batch_size.int_from_positive_int(), + length.int_from_positive_int(), + attrs.k.int_from_positive_int(), attrs.sorted); } @@ -103,8 +103,8 @@ static std::optional auto indices = acc.get_tensor(INDICES); - nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input_grad.shape.get_volume() / length; + positive_int length = 
input_grad.shape.at(legion_dim_t{0_n}); + positive_int batch_size = positive_int{input_grad.shape.num_elements() / length}; return profile(backward_kernel, profiling, @@ -113,9 +113,9 @@ static std::optional output_grad.get_float_ptr(), indices.get_int32_ptr(), input_grad.get_float_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative()); + batch_size.int_from_positive_int(), + length.int_from_positive_int(), + attrs.k.int_from_positive_int()); } TaskImplFunction get_topk_init_task_impl() { diff --git a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc index e1c5a9bd8d..dcc2e9e580 100644 --- a/lib/task-spec/test/src/task-spec/arg_ref.cc +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -10,8 +10,6 @@ enum class ExampleLabelType { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ArgRefSpec::holds") { - CHECK_MESSAGE(false, "TODO: ArgRefSpec"); - ArgRefSpec arg_ref_spec = ArgRefSpec::create( ArgRef{ExampleLabelType::STRING} ); diff --git a/lib/utils/include/utils/containers/sum.h b/lib/utils/include/utils/containers/sum.h index d6061e396e..a725879f76 100644 --- a/lib/utils/include/utils/containers/sum.h +++ b/lib/utils/include/utils/containers/sum.h @@ -1,6 +1,8 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_SUM_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_SUM_H +#include + namespace FlexFlow { /** @@ -8,11 +10,20 @@ namespace FlexFlow { **/ template Element sum(Container const &container) { - Element result = Element{0}; + std::optional result; for (Element const &element : container) { - result += element; + if (result.has_value()) { + result.value() += element; + } else { + result = element; + } + } + + if (result.has_value()) { + return result.value(); + } else { + return Element{0}; } - return result; } } // namespace FlexFlow diff --git a/lib/utils/include/utils/nonnegative_int/ceildiv.h b/lib/utils/include/utils/nonnegative_int/ceildiv.h index 939ea3de51..e2ff0bc52a 100644 --- a/lib/utils/include/utils/nonnegative_int/ceildiv.h +++ b/lib/utils/include/utils/nonnegative_int/ceildiv.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H #include "utils/nonnegative_int/nonnegative_int.h" + namespace FlexFlow { nonnegative_int ceildiv(nonnegative_int numerator, nonnegative_int denominator); diff --git a/lib/utils/include/utils/positive_int/ceildiv.h b/lib/utils/include/utils/positive_int/ceildiv.h new file mode 100644 index 0000000000..961e3ca298 --- /dev/null +++ b/lib/utils/include/utils/positive_int/ceildiv.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_CEILDIV_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_CEILDIV_H + +#include "utils/positive_int/positive_int.h" + +namespace FlexFlow { + +positive_int ceildiv(positive_int numerator, positive_int denominator); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/positive_int/positive_int.h b/lib/utils/include/utils/positive_int/positive_int.h new file mode 100644 index 0000000000..9ff0f4da64 --- /dev/null +++ b/lib/utils/include/utils/positive_int/positive_int.h @@ -0,0 +1,114 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_POSITIVE_INT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_POSITIVE_INT_POSITIVE_INT_H + +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +struct positive_int { + positive_int() = delete; + explicit positive_int(int value); + explicit positive_int(size_t value); + explicit 
positive_int(nonnegative_int value); + + explicit operator int() const noexcept; + explicit operator nonnegative_int() const noexcept; + + bool operator<(positive_int other) const; + bool operator==(positive_int other) const; + bool operator>(positive_int other) const; + bool operator<=(positive_int other) const; + bool operator!=(positive_int other) const; + bool operator>=(positive_int other) const; + + bool operator<(nonnegative_int other) const; + bool operator==(nonnegative_int other) const; + bool operator>(nonnegative_int other) const; + bool operator<=(nonnegative_int other) const; + bool operator!=(nonnegative_int other) const; + bool operator>=(nonnegative_int other) const; + + friend bool operator<(nonnegative_int lhs, positive_int rhs); + friend bool operator==(nonnegative_int lhs, positive_int rhs); + friend bool operator>(nonnegative_int lhs, positive_int rhs); + friend bool operator<=(nonnegative_int lhs, positive_int rhs); + friend bool operator!=(nonnegative_int lhs, positive_int rhs); + friend bool operator>=(nonnegative_int lhs, positive_int rhs); + + bool operator<(int other) const; + bool operator==(int other) const; + bool operator>(int other) const; + bool operator<=(int other) const; + bool operator!=(int other) const; + bool operator>=(int other) const; + + friend bool operator<(int lhs, positive_int rhs); + friend bool operator==(int lhs, positive_int rhs); + friend bool operator>(int lhs, positive_int rhs); + friend bool operator<=(int lhs, positive_int rhs); + friend bool operator!=(int lhs, positive_int rhs); + friend bool operator>=(int lhs, positive_int rhs); + + positive_int operator+(positive_int other) const; + positive_int operator+(nonnegative_int other) const; + positive_int &operator++(); + positive_int operator++(int); + positive_int &operator+=(positive_int other); + positive_int &operator+=(nonnegative_int other); + + positive_int operator*(positive_int other) const; + positive_int &operator*=(positive_int other); + nonnegative_int operator*(nonnegative_int other) const; + + friend nonnegative_int operator*(nonnegative_int lhs, positive_int rhs); + + nonnegative_int operator/(positive_int other) const; + friend nonnegative_int operator/(nonnegative_int lhs, positive_int rhs); + + friend float operator/(float lhs, positive_int rhs); + friend float &operator/=(float &lhs, positive_int rhs); + + nonnegative_int operator%(positive_int other) const; + nonnegative_int operator%(nonnegative_int other) const; + + int int_from_positive_int() const; + nonnegative_int nonnegative_int_from_positive_int() const; + + friend std::ostream &operator<<(std::ostream &os, positive_int n); + + friend int format_as(positive_int); + +private: + void check_invariant() const; + +private: + int value_; +}; + +positive_int operator""_p(unsigned long long int); + +} // namespace FlexFlow + +namespace nlohmann { +template <> +struct adl_serializer<::FlexFlow::positive_int> { + static ::FlexFlow::positive_int from_json(json const &j); + static void to_json(json &j, ::FlexFlow::positive_int t); +}; +} // namespace nlohmann + +namespace rc { +template <> +struct Arbitrary<::FlexFlow::positive_int> { + static Gen<::FlexFlow::positive_int> arbitrary(); +}; +} // namespace rc + +namespace std { +template <> +struct hash<::FlexFlow::positive_int> { + std::size_t operator()(FlexFlow::positive_int n) const noexcept; +}; +} // namespace std + +#endif diff --git a/lib/utils/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/src/utils/nonnegative_int/ceildiv.cc deleted file mode 100644 index 
f1115b25b5..0000000000 --- a/lib/utils/src/utils/nonnegative_int/ceildiv.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include "utils/nonnegative_int/ceildiv.h" -#include "utils/exception.h" - -namespace FlexFlow { - -nonnegative_int ceildiv(nonnegative_int numerator, - nonnegative_int denominator) { - if (denominator == 0) { - throw mk_runtime_error(fmt::format( - "ceildiv expected denominator != 0, but received {}", denominator)); - } - - int n = numerator.unwrap_nonnegative(); - int d = denominator.unwrap_nonnegative(); - - int result = (n + d - 1) / d; - return nonnegative_int{result}; -} - -} // namespace FlexFlow diff --git a/lib/utils/src/utils/positive_int/ceildiv.cc b/lib/utils/src/utils/positive_int/ceildiv.cc new file mode 100644 index 0000000000..b642db4edd --- /dev/null +++ b/lib/utils/src/utils/positive_int/ceildiv.cc @@ -0,0 +1,14 @@ +#include "utils/positive_int/ceildiv.h" +#include "utils/exception.h" + +namespace FlexFlow { + +positive_int ceildiv(positive_int numerator, positive_int denominator) { + int n = numerator.int_from_positive_int(); + int d = denominator.int_from_positive_int(); + + int result = (n + d - 1) / d; + return positive_int{result}; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc new file mode 100644 index 0000000000..70233e74d8 --- /dev/null +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -0,0 +1,283 @@ +#include "utils/positive_int/positive_int.h" +#include + +namespace FlexFlow { + +positive_int::positive_int(int value) + : value_(value) +{ + this->check_invariant(); +} + +positive_int::positive_int(size_t value) + : value_(value) +{ + this->check_invariant(); +} + +positive_int::positive_int(nonnegative_int value) + : value_(value.unwrap_nonnegative()) +{ + this->check_invariant(); +} + +positive_int::operator int() const noexcept { + return this->value_; +} + +positive_int::operator nonnegative_int() const noexcept { + return nonnegative_int{this->value_}; +} + +bool positive_int::operator<(positive_int other) const { + return this->value_ < other.value_; +} + +bool positive_int::operator==(positive_int other) const { + return this->value_ == other.value_; +} + +bool positive_int::operator>(positive_int other) const { + return this->value_ > other.value_; +} + +bool positive_int::operator<=(positive_int other) const { + return this->value_ <= other.value_; +} + +bool positive_int::operator!=(positive_int other) const { + return this->value_ != other.value_; +} + +bool positive_int::operator>=(positive_int other) const { + return this->value_ >= other.value_; +} + +bool positive_int::operator<(nonnegative_int other) const { + return this->value_ < other; +} + +bool positive_int::operator==(nonnegative_int other) const { + return this->value_ == other; +} + +bool positive_int::operator>(nonnegative_int other) const { + return this->value_ > other; +} + +bool positive_int::operator<=(nonnegative_int other) const { + return this->value_ <= other; +} + +bool positive_int::operator!=(nonnegative_int other) const { + return this->value_ != other; +} + +bool positive_int::operator>=(nonnegative_int other) const { + return this->value_ >= other; +} + +bool operator<(nonnegative_int lhs, positive_int rhs) { + return lhs < rhs.value_; +} + +bool operator==(nonnegative_int lhs, positive_int rhs) { + return lhs == rhs.value_; +} + +bool operator>(nonnegative_int lhs, positive_int rhs) { + return lhs > rhs.value_; +} + +bool operator<=(nonnegative_int lhs, 
positive_int rhs) { + return lhs <= rhs.value_; +} + +bool operator!=(nonnegative_int lhs, positive_int rhs) { + return lhs != rhs.value_; +} + +bool operator>=(nonnegative_int lhs, positive_int rhs) { + return lhs >= rhs.value_; +} + +bool positive_int::operator<(int other) const { + return this->value_ < other; +} + +bool positive_int::operator==(int other) const { + return this->value_ == other; +} + +bool positive_int::operator>(int other) const { + return this->value_ > other; +} + +bool positive_int::operator<=(int other) const { + return this->value_ <= other; +} + +bool positive_int::operator!=(int other) const { + return this->value_ != other; +} + +bool positive_int::operator>=(int other) const { + return this->value_ >= other; +} + +bool operator<(int lhs, positive_int rhs) { + return lhs < rhs.value_; +} + +bool operator==(int lhs, positive_int rhs) { + return lhs == rhs.value_; +} + +bool operator>(int lhs, positive_int rhs) { + return lhs > rhs.value_; +} + +bool operator<=(int lhs, positive_int rhs) { + return lhs <= rhs.value_; +} + +bool operator!=(int lhs, positive_int rhs) { + return lhs != rhs.value_; +} + +bool operator>=(int lhs, positive_int rhs) { + return lhs >= rhs.value_; +} + +positive_int positive_int::operator+(positive_int other) const { + return positive_int{this->value_ + other.value_}; +} + +positive_int positive_int::operator+(nonnegative_int other) const { + return positive_int{this->value_ + other.unwrap_nonnegative()}; +} + +positive_int &positive_int::operator++() { + this->value_++; + this->check_invariant(); + return *this; +} + +positive_int positive_int::operator++(int) { + positive_int result = *this; + this->value_++; + this->check_invariant(); + return result; +} + +positive_int &positive_int::operator+=(positive_int other) { + this->value_ += other.value_; + this->check_invariant(); + return *this; +} + +positive_int &positive_int::operator+=(nonnegative_int other) { + this->value_ += other.unwrap_nonnegative(); + this->check_invariant(); + return *this; +} + +positive_int positive_int::operator*(positive_int other) const { + return positive_int{this->value_ * other.value_}; +} + +positive_int &positive_int::operator*=(positive_int other) { + this->value_ *= other.value_; + this->check_invariant(); + return *this; +} + +nonnegative_int positive_int::operator*(nonnegative_int other) const { + return other * *this; +} + + +nonnegative_int operator*(nonnegative_int lhs, positive_int rhs) { + return lhs * rhs.nonnegative_int_from_positive_int(); +} + +nonnegative_int positive_int::operator/(positive_int other) const { + return nonnegative_int{this->value_ / other.value_}; +} + +nonnegative_int operator/(nonnegative_int lhs, positive_int rhs) { + return nonnegative_int{lhs.unwrap_nonnegative() / rhs.value_}; +} + +float operator/(float lhs, positive_int rhs) { + return lhs / rhs.value_; +} + +float &operator/=(float &lhs, positive_int rhs) { + return (lhs /= rhs.value_); +} + +nonnegative_int positive_int::operator%(positive_int other) const { + return nonnegative_int{this->value_ % other.value_}; +} + +nonnegative_int positive_int::operator%(nonnegative_int other) const { + return nonnegative_int{this->value_ % other.unwrap_nonnegative()}; +} + +int positive_int::int_from_positive_int() const { + return this->value_; +} + +nonnegative_int positive_int::nonnegative_int_from_positive_int() const { + return nonnegative_int{this->value_}; +} + +std::ostream &operator<<(std::ostream &os, positive_int n) { + os << n.value_; + return os; +} + +int 
format_as(positive_int x) { + return x.value_; +} + +void positive_int::check_invariant() const { + ASSERT(this->value_ > 0); +} + +positive_int operator""_p(unsigned long long int x) { + ASSERT(x <= static_cast(std::numeric_limits::max())); + + return positive_int{static_cast(x)}; + +} + +} // namespace FlexFlow + +namespace nlohmann { +::FlexFlow::positive_int + adl_serializer<::FlexFlow::positive_int>::from_json(json const &j) { + return ::FlexFlow::positive_int{j.template get()}; +} + +void adl_serializer<::FlexFlow::positive_int>::to_json( + json &j, ::FlexFlow::positive_int t) { + j = t.int_from_positive_int(); +} +} // namespace nlohmann + +namespace rc { +Gen<::FlexFlow::positive_int> + Arbitrary<::FlexFlow::positive_int>::arbitrary() { + return gen::construct<::FlexFlow::positive_int>(gen::positive()); +} +} // namespace rc + +namespace std { +std::size_t hash<::FlexFlow::positive_int>::operator()( + FlexFlow::positive_int n) const noexcept { + return std::hash{}(n.int_from_positive_int()); +} + +} // namespace std diff --git a/lib/utils/test/src/utils/containers/sum.cc b/lib/utils/test/src/utils/containers/sum.cc index 32d8cd32a3..2e335b1051 100644 --- a/lib/utils/test/src/utils/containers/sum.cc +++ b/lib/utils/test/src/utils/containers/sum.cc @@ -1,6 +1,7 @@ #include "utils/containers/sum.h" #include #include +#include "utils/positive_int/positive_int.h" using namespace ::FlexFlow; @@ -24,4 +25,21 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("sum(std::vector)") { + SUBCASE("returns the sum if the input is not empty") { + std::vector input = {3_p, 9_p, 3_p}; + + positive_int result = sum(input); + positive_int correct = 15_p; + + CHECK(result == correct); + } + + SUBCASE("throws an error if the input is empty, as then 0 should be returned") { + std::vector input = {}; + + CHECK_THROWS(sum(input)); + } + } } diff --git a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc deleted file mode 100644 index 7ac882ff9f..0000000000 --- a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "utils/nonnegative_int/ceildiv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("ceildiv(nonnegative_int, nonnegative_int)") { - SUBCASE("divides evenly") { - nonnegative_int numerator = 12_n; - nonnegative_int denominator = 3_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 4_n; - - CHECK(result == correct); - } - - SUBCASE("does not divide evenly") { - nonnegative_int numerator = 17_n; - nonnegative_int denominator = 4_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 5_n; - - CHECK(result == correct); - } - - SUBCASE("denominator is zero") { - nonnegative_int numerator = 15_n; - nonnegative_int denominator = 0_n; - - CHECK_THROWS(ceildiv(numerator, denominator)); - } - - SUBCASE("numerator is zero") { - nonnegative_int numerator = 0_n; - nonnegative_int denominator = 1_n; - - nonnegative_int result = ceildiv(numerator, denominator); - nonnegative_int correct = 0_n; - - CHECK(result == correct); - } - - SUBCASE("denominator and numerator are zero") { - nonnegative_int numerator = 0_n; - nonnegative_int denominator = 0_n; - - CHECK_THROWS(ceildiv(numerator, denominator)); - } - } -} diff --git a/lib/utils/test/src/utils/positive_int/ceildiv.cc b/lib/utils/test/src/utils/positive_int/ceildiv.cc new file mode 100644 index 0000000000..7c37e06d4d --- /dev/null 
+++ b/lib/utils/test/src/utils/positive_int/ceildiv.cc @@ -0,0 +1,28 @@ +#include "utils/positive_int/ceildiv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("ceildiv(positive_int, positive_int)") { + SUBCASE("divides evenly") { + positive_int numerator = 12_p; + positive_int denominator = 3_p; + + positive_int result = ceildiv(numerator, denominator); + positive_int correct = 4_p; + + CHECK(result == correct); + } + + SUBCASE("does not divide evenly") { + positive_int numerator = 17_p; + positive_int denominator = 4_p; + + positive_int result = ceildiv(numerator, denominator); + positive_int correct = 5_p; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc new file mode 100644 index 0000000000..25348d34da --- /dev/null +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -0,0 +1,10 @@ +#include +#include "utils/positive_int/positive_int.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("positive_int") { + CHECK_MESSAGE(false, "TODO: positive_int"); + } +} From a266a79e1092f395c14cbb3225d932389a293621 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Wed, 21 May 2025 08:20:50 +0000 Subject: [PATCH 74/91] Format --- .../src/compiler/allowed_machine_views.cc | 19 +- ...ion_graph_series_parallel_decomposition.cc | 11 +- lib/kernels/include/kernels/accessor.h | 1 - lib/kernels/include/kernels/array_coord.h | 5 +- lib/kernels/include/kernels/array_shape.h | 5 +- .../kernels/compare_tensor_accessors.h | 52 +++--- .../kernels/create_accessor_with_contents.h | 36 ++-- .../include/kernels/fill_tensor_accessor.h | 10 +- lib/kernels/include/kernels/legion_dim.h | 4 +- .../kernels/managed_per_device_ff_handle.h | 14 +- .../include/kernels/map_tensor_accessors.h | 60 ++++--- .../include/kernels/reduce_tensor_accessor.h | 66 ++++--- lib/kernels/src/cpu/ops/combine_kernels.cc | 3 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 16 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 3 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 13 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 13 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 13 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 13 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 4 +- lib/kernels/src/cuda/optimizer_kernels.cu | 22 +-- lib/kernels/src/kernels/accessor.cc | 13 +- lib/kernels/src/kernels/array_coord.cc | 5 +- lib/kernels/src/kernels/array_shape.cc | 19 +- .../src/kernels/compare_tensor_accessors.cc | 104 ++++++----- .../kernels/create_accessor_with_contents.cc | 49 ++---- .../src/kernels/fill_tensor_accessor.cc | 14 +- .../src/kernels/format_accessor_contents.cc | 46 +++-- .../src/kernels/map_tensor_accessors.cc | 26 +-- .../src/kernels/reduce_tensor_accessor.cc | 15 +- .../src/kernels/tensor_accessor_reductions.cc | 18 +- .../src/managed_per_device_ff_handle.cc | 34 ++-- .../test/src/cpu/ops/replicate_kernels.cc | 5 +- .../test/src/cpu/ops/reverse_kernels.cc | 130 +++++++------- lib/kernels/test/src/internal/test_utils.cc | 9 +- lib/kernels/test/src/kernels/accessor.cc | 28 +-- lib/kernels/test/src/kernels/array_coord.cc | 27 +-- lib/kernels/test/src/kernels/array_shape.cc | 25 ++- .../src/kernels/compare_tensor_accessors.cc | 166 +++++++++--------- .../kernels/create_accessor_with_contents.cc | 89 +++++----- .../src/kernels/format_accessor_contents.cc | 55 +++--- 
.../test/src/kernels/map_tensor_accessors.cc | 49 ++++-- .../src/kernels/reduce_tensor_accessor.cc | 78 ++++---- .../src/kernels/tensor_accessor_reductions.cc | 106 +++++------ lib/kernels/test/src/test_attention_kernel.cc | 5 +- .../test/src/test_batch_matmul_kernel.cc | 5 +- .../test/src/test_batch_norm_kernel.cc | 5 +- lib/kernels/test/src/test_combine_kernel.cc | 5 +- lib/kernels/test/src/test_concat_kernel.cc | 5 +- lib/kernels/test/src/test_dropout.cc | 5 +- lib/kernels/test/src/test_flat_kernel.cc | 5 +- lib/kernels/test/src/test_gather_kernels.cc | 5 +- .../test/src/test_layer_norm_kernels.cc | 5 +- .../test/src/test_managed_ff_stream.cc | 5 +- .../src/test_managed_per_device_ff_handle.cc | 16 +- lib/kernels/test/src/test_partition_kernel.cc | 5 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 41 +++-- lib/kernels/test/src/test_reduction_kernel.cc | 5 +- lib/kernels/test/src/test_replicate_kernel.cc | 32 ++-- lib/kernels/test/src/test_reshape_kernel.cc | 5 +- lib/kernels/test/src/test_reverse_kernels.cc | 10 +- lib/kernels/test/src/test_softmax_kernel.cc | 5 +- lib/kernels/test/src/test_split_kernel.cc | 5 +- lib/kernels/test/src/test_transpose_kernel.cc | 5 +- .../local_task_argument_accessor.h | 2 +- .../include/local-execution/loss_functions.h | 2 +- .../include/local-execution/optimizer.h | 2 +- lib/local-execution/src/allocated_tensors.cc | 11 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_functions.cc | 24 ++- lib/local-execution/src/task_registry.cc | 2 +- .../test/src/test_allocated_tensors.cc | 11 +- lib/local-execution/test/src/test_e2e.cc | 42 ++--- .../test/src/test_local_cost_estimator.cc | 5 +- .../test/src/test_local_task_arg_accessor.cc | 3 +- .../test/src/test_local_tensor_backing.cc | 6 +- .../test/src/test_loss_functions.cc | 17 +- .../test/src/test_task_registry.cc | 2 +- .../test/src/test_unallocated_tensors.cc | 11 +- lib/local-execution/test/src/test_update.cc | 11 +- lib/models/src/models/dlrm/dlrm.cc | 22 +-- lib/op-attrs/include/op-attrs/datatype.h | 33 ++-- .../initializers/kaiming_initializer_mode.h | 2 +- .../include/op-attrs/parallel_tensor_dims.h | 10 +- .../include/op-attrs/parallel_tensor_shape.h | 13 +- .../op-attrs/replica_parallel_dim_set.h | 2 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 12 +- .../initializers/kaiming_initializer_mode.cc | 2 +- lib/op-attrs/src/op-attrs/ops/attention.cc | 9 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 7 +- lib/op-attrs/src/op-attrs/ops/combine.cc | 6 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 3 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 4 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 6 +- lib/op-attrs/src/op-attrs/ops/reduction.cc | 5 +- .../src/op-attrs/parallel_tensor_dims.cc | 3 +- .../src/op-attrs/parallel_tensor_shape.cc | 7 +- .../src/op-attrs/replica_parallel_dim_set.cc | 2 +- .../test/src/op-attrs/ops/attention.cc | 30 +--- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 10 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 3 +- .../test/src/op-attrs/ops/embedding.cc | 10 +- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 15 +- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 4 +- .../test/src/op-attrs/ops/reduction.cc | 3 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 21 +-- lib/pcg/include/pcg/machine_specification.h | 4 +- lib/pcg/src/pcg/machine_specification.cc | 4 +- lib/pcg/src/pcg/machine_view.cc | 4 +- 
lib/pcg/src/pcg/operator_task_space.cc | 3 +- .../materialize_operator_from_attrs_map.cc | 3 +- .../task-spec/generic_task_impl_function.h | 2 +- .../task-spec/init_op_task_impl_function.h | 2 +- .../task-spec/itask_argument_accessor.h | 2 +- .../include/task-spec/ops/attention.h | 2 +- .../include/task-spec/ops/batch_matmul.h | 2 +- .../include/task-spec/ops/batch_norm.h | 2 +- lib/task-spec/include/task-spec/ops/cast.h | 2 +- lib/task-spec/include/task-spec/ops/combine.h | 2 +- lib/task-spec/include/task-spec/ops/concat.h | 2 +- lib/task-spec/include/task-spec/ops/conv_2d.h | 2 +- lib/task-spec/include/task-spec/ops/dropout.h | 2 +- .../include/task-spec/ops/element_binary.h | 2 +- .../include/task-spec/ops/element_unary.h | 2 +- .../include/task-spec/ops/embedding.h | 2 +- lib/task-spec/include/task-spec/ops/flat.h | 2 +- lib/task-spec/include/task-spec/ops/gather.h | 2 +- .../include/task-spec/ops/layer_norm.h | 2 +- lib/task-spec/include/task-spec/ops/linear.h | 2 +- lib/task-spec/include/task-spec/ops/pool_2d.h | 2 +- lib/task-spec/include/task-spec/ops/reduce.h | 2 +- .../include/task-spec/ops/reduction.h | 2 +- .../include/task-spec/ops/repartition.h | 2 +- .../include/task-spec/ops/replicate.h | 2 +- lib/task-spec/include/task-spec/ops/reshape.h | 2 +- lib/task-spec/include/task-spec/ops/reverse.h | 2 +- lib/task-spec/include/task-spec/ops/softmax.h | 2 +- lib/task-spec/include/task-spec/ops/split.h | 2 +- lib/task-spec/include/task-spec/ops/topk.h | 2 +- .../include/task-spec/ops/transpose.h | 2 +- .../task-spec/task_argument_accessor.h | 2 +- .../include/task-spec/task_signature_impl.h | 2 +- lib/task-spec/src/task-spec/ops/layer_norm.cc | 3 +- lib/task-spec/src/task-spec/ops/linear.cc | 2 +- lib/task-spec/src/task-spec/ops/split.cc | 3 +- lib/task-spec/src/task-spec/ops/topk.cc | 3 +- lib/task-spec/test/src/task-spec/arg_ref.cc | 12 +- .../src/utils/positive_int/positive_int.cc | 19 +- lib/utils/test/src/utils/containers/sum.cc | 11 +- .../src/utils/positive_int/positive_int.cc | 2 +- 154 files changed, 1106 insertions(+), 1114 deletions(-) diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index fa543e78b5..370cb5a4ec 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -17,10 +17,10 @@ #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" -#include "utils/positive_int/ceildiv.h" #include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" +#include "utils/positive_int/ceildiv.h" namespace FlexFlow { @@ -57,7 +57,8 @@ static std::unordered_set product(transform(tensor_dims, [](positive_int num_devices) { return nonnegative_int{num_devices.int_from_positive_int() - 1}; })); - return ceildiv(total_devices, positive_int{min_num_devices_with_full_stride_volume}); + return ceildiv(total_devices, + positive_int{min_num_devices_with_full_stride_volume}); }; auto candidate_strides = [&](std::vector const &tensor_dims, @@ -66,9 +67,11 @@ static std::unordered_set positive_int max_stride_upper_bound = get_max_stride_upper_bound(tensor_dims, total_devices); - std::vector single_stride_range = - transform(nonnegative_range(1_n, max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), - [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); + std::vector single_stride_range = 
transform( + nonnegative_range( + 1_n, + max_stride_upper_bound.nonnegative_int_from_positive_int() + 1_n), + [](nonnegative_int stride) { return stride_t{positive_int{stride}}; }); std::unordered_multiset> raw_stride_vectors = cartesian_product( repeat_element(/*num_times=*/num_elements(tensor_dims), @@ -83,9 +86,11 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { + for (nonnegative_int node_idx : + nonnegative_range(ms.num_nodes.nonnegative_int_from_positive_int())) { for (nonnegative_int device_idx : - nonnegative_range(get_num_devices_per_node(ms, device_type).nonnegative_int_from_positive_int())) { + nonnegative_range(get_num_devices_per_node(ms, device_type) + .nonnegative_int_from_positive_int())) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index 1c801161ca..1625d79f80 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -339,12 +339,11 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = - TensorShape{TensorDims{FFOrdered{ - 10_p, - 12_p, - }}, - DataType::FLOAT}; + TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ + 10_p, + 12_p, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, /*outDim=*/14_p); diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index c24695298b..eb2a431bd1 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -226,7 +226,6 @@ bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, GenericTensorAccessorW const &acc2); - bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, ArrayShape const &expected_shape, DataType const &expected_dtype); diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h index 84e68fa053..730bb49e81 100644 --- a/lib/kernels/include/kernels/array_coord.h +++ b/lib/kernels/include/kernels/array_coord.h @@ -5,8 +5,9 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, - std::function const &should_drop_dim); +ArrayCoord + array_coord_drop_dims(ArrayCoord const &coord, + std::function const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 355b6e5bca..2b1397dc0e 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -61,8 +61,9 @@ TensorShape get_tensor_shape(ArrayShape const &, DataType); std::unordered_set get_ff_dim_t_set(ArrayShape const &); std::unordered_set get_array_coord_set(ArrayShape const &); -ArrayShape array_shape_drop_dims(ArrayShape const &shape, - std::function const &should_drop_dim); +ArrayShape + 
array_shape_drop_dims(ArrayShape const &shape, + std::function const &should_drop_dim); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/compare_tensor_accessors.h b/lib/kernels/include/kernels/compare_tensor_accessors.h index ee438505fb..c16ae0857c 100644 --- a/lib/kernels/include/kernels/compare_tensor_accessors.h +++ b/lib/kernels/include/kernels/compare_tensor_accessors.h @@ -6,29 +6,35 @@ namespace FlexFlow { -GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); - -GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &allocator); +GenericTensorAccessorW + compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); + +GenericTensorAccessorW + compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &allocator); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index 966a7a30ad..9691b0c90a 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h @@ -23,7 +23,8 @@ GenericTensorAccessorW Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : + nonnegative_range(ncols.nonnegative_int_from_positive_int())) { cpu_accessor.at>(FFOrdered{col_idx}) = contents.at(col_idx.unwrap_nonnegative()); } @@ -53,11 +54,13 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int row_idx : nonnegative_range(nrows.nonnegative_int_from_positive_int())) { - for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{row_idx, col_idx}) = - contents.at(row_idx.unwrap_nonnegative()) - 
.at(col_idx.unwrap_nonnegative()); + for (nonnegative_int row_idx : + nonnegative_range(nrows.nonnegative_int_from_positive_int())) { + for (nonnegative_int col_idx : + nonnegative_range(ncols.nonnegative_int_from_positive_int())) { + cpu_accessor.at>(FFOrdered{ + row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); } } @@ -95,9 +98,12 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW cpu_accessor = cpu_allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim0_idx : + nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : + nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : + nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) @@ -151,10 +157,14 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - for (nonnegative_int dim0_idx : nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim1_idx : nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim2_idx : nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { - for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim0_idx : + nonnegative_range(dim0_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim1_idx : + nonnegative_range(dim1_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim2_idx : + nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { + for (nonnegative_int dim3_idx : + nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { accessor.at>( FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = contents.at(dim0_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h index 8db63f5a2d..b10345933f 100644 --- a/lib/kernels/include/kernels/fill_tensor_accessor.h +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -9,13 +9,11 @@ namespace FlexFlow { void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); -GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator); +GenericTensorAccessorW create_accessor_w_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator); -GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator); +GenericTensorAccessorR create_accessor_r_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index 63c6ddb3c6..796423102b 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -7,9 
+7,9 @@ #include "op-attrs/ff_ordered/ff_ordered.h" #include "utils/containers/set_of.h" #include "utils/containers/transform.h" -#include "utils/positive_int/positive_int.h" -#include "utils/nonnegative_int/num_elements.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index d409ec19ad..287369a202 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -33,12 +33,14 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; -ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, - bool allowTensorOpMathConversion); -ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank, - size_t workSpaceSize, - bool allowTensorOpMathConversion); +ManagedPerDeviceFFHandle + initialize_single_gpu_handle(size_t workSpaceSize, + bool allowTensorOpMathConversion); +ManagedPerDeviceFFHandle + initialize_multi_gpu_handle(int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h index eed17cbb61..2933a611cf 100644 --- a/lib/kernels/include/kernels/map_tensor_accessors.h +++ b/lib/kernels/include/kernels/map_tensor_accessors.h @@ -3,11 +3,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "kernels/local_cpu_allocator.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" -#include "utils/containers/require_same.h" +#include "kernels/local_cpu_allocator.h" #include "utils/containers/require_all_same1.h" +#include "utils/containers/require_same.h" #include namespace FlexFlow { @@ -15,23 +15,21 @@ namespace FlexFlow { template struct CPUMapTensorAccessorInPlace { template - void operator()(GenericTensorAccessorW &accessor, - F &&f) { + void operator()(GenericTensorAccessorW &accessor, F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) { - accessor.at
<DT>(coord.ff_ordered) - = f(accessor.at
<DT>(coord.ff_ordered)); + accessor.at
<DT>(coord.ff_ordered) = f(accessor.at
<DT>(coord.ff_ordered)); } } }; template <typename F> -void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, - F &&f) { +void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); - DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}(accessor.data_type, accessor, f); + DataTypeDispatch1<CPUMapTensorAccessorInPlace>{}( + accessor.data_type, accessor, f); } template <DataType DT> @@ -47,9 +45,8 @@ struct CPUMapTensorAccessor { for (ArrayCoord const &coord : get_array_coord_set(shape)) { output.at< - type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>> - >(coord.ff_ordered) - = f(input.at
<DT>(coord.ff_ordered)); + type_to_data_type_enum_v<std::invoke_result_t<F, real_type_t<DT>>>>( + coord.ff_ordered) = f(input.at<DT>
(coord.ff_ordered)); } } }; @@ -59,30 +56,32 @@ GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, F &&f, Allocator &output_allocator) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(input.shape, type_to_data_type_enum_v)); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( + get_tensor_shape(input.shape, type_to_data_type_enum_v)); - DataTypeDispatch1{}(input.data_type, input_cpu, output_cpu, f); + DataTypeDispatch1{}( + input.data_type, input_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } template struct CPUMapTensorAccessors2 { - template < - typename F, - typename Out = std::invoke_result_t, real_type_t> - > + template , real_type_t>> void operator()(GenericTensorAccessorR const &lhs, GenericTensorAccessorR const &rhs, GenericTensorAccessorW &output, F &&f) { ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ - lhs.shape, - rhs.shape, - output.shape, + lhs.shape, + rhs.shape, + output.shape, })); ASSERT(lhs.device_type == DeviceType::CPU); @@ -90,8 +89,8 @@ struct CPUMapTensorAccessors2 { ASSERT(output.device_type == DeviceType::CPU); for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at>(coord.ff_ordered) - = f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); + output.at>(coord.ff_ordered) = + f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); } } }; @@ -105,16 +104,19 @@ GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, ArrayShape shape = require_same(lhs.shape, rhs.shape); Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR lhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); - GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); + GenericTensorAccessorR lhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); + GenericTensorAccessorR rhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); - DataTypeDispatch2{}(lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); + DataTypeDispatch2{}( + lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } - } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h index 4be375299f..d803c7ef9b 100644 --- a/lib/kernels/include/kernels/reduce_tensor_accessor.h +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -4,15 +4,15 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/array_coord.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/local_cpu_allocator.h" #include "utils/containers/contains.h" -#include "utils/containers/sorted.h" -#include "utils/containers/group_by.h" -#include "utils/containers/transform.h" #include "utils/containers/foldl1.h" #include 
"utils/containers/foldr1.h" -#include "kernels/local_cpu_allocator.h" -#include "kernels/copy_tensor_accessor.h" -#include "kernels/datatype_dispatch.h" +#include "utils/containers/group_by.h" +#include "utils/containers/sorted.h" +#include "utils/containers/transform.h" namespace FlexFlow { @@ -32,18 +32,24 @@ struct CPUReduceTensorAccessorInDims { return contains(dims_to_reduce, dim); }; - std::unordered_map> output_coord_from_input_coord - = group_by(get_array_coord_set(input.shape), - [&](ArrayCoord const &input_coord) { return array_coord_drop_dims(input_coord, should_drop_dim); }); + std::unordered_map> + output_coord_from_input_coord = group_by( + get_array_coord_set(input.shape), + [&](ArrayCoord const &input_coord) { + return array_coord_drop_dims(input_coord, should_drop_dim); + }); - for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { - std::vector input_values = transform(sorted(input_coords), - [&](ArrayCoord const &input_coord) -> T { - return input.at
(input_coord.ff_ordered); - }); + for (auto const &[output_coord, input_coords] : + output_coord_from_input_coord) { + std::vector input_values = transform( + sorted(input_coords), [&](ArrayCoord const &input_coord) -> T { + return input.at
(input_coord.ff_ordered); + }); T result = foldl1(input_values, f); - ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { return f(elem, accum); })); + ASSERT(result == foldr1(input_values, [&](T const &accum, T const &elem) { + return f(elem, accum); + })); output.at
<DT>(output_coord.ff_ordered) = result; } @@ -51,34 +57,40 @@ }; template <typename F> -GenericTensorAccessorW reduce_tensor_accessor_in_dims( - GenericTensorAccessorR const &input, - std::unordered_set<ff_dim_t> const &dims, - Allocator &output_allocator, - F &&f) { +GenericTensorAccessorW + reduce_tensor_accessor_in_dims(GenericTensorAccessorR const &input, + std::unordered_set<ff_dim_t> const &dims, + Allocator &output_allocator, + F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); auto should_drop_dim = [&](ff_dim_t dim) -> bool { return contains(dims, dim); }; - ArrayShape reduced_shape = array_shape_drop_dims(input.shape, should_drop_dim); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor(get_tensor_shape(reduced_shape, input.data_type)); + ArrayShape reduced_shape = + array_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( + get_tensor_shape(reduced_shape, input.data_type)); - DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}(input_cpu.data_type, input_cpu, output_cpu, dims, f); + DataTypeDispatch1<CPUReduceTensorAccessorInDims>{}( + input_cpu.data_type, input_cpu, output_cpu, dims, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } template <DataType DT, typename F> -real_type_t
<DT> reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, - F &&f) { +real_type_t<DT>
+ reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &input, + F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); std::unordered_set<ff_dim_t> input_dims = get_ff_dim_t_set(input.shape); - GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); + GenericTensorAccessorW reduced = + reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); return accessor_get_only_value
<DT>(reduced); } diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc index 557f523f17..c0c856ae5b 100644 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ b/lib/kernels/src/cpu/ops/combine_kernels.cc @@ -18,7 +18,8 @@ template <DataType DT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); + size_t num_elements = + output_grad.shape.num_elements().int_from_positive_int(); for (int i = 0; i < num_elements; ++i) { input_grad.get
<DT>()[i] += output_grad.get
<DT>()[i]; } diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc index d97a274d80..bc9c4eab0d 100644 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/src/cpu/ops/replicate_kernels.cc @@ -23,8 +23,8 @@ struct CPUBackwardKernel { nonnegative_int num_replicas) { using T = real_type_t
<DT>; - for (nonnegative_int i : - nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { + for (nonnegative_int i : + nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { T cur_sum = 0; for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { cur_sum += output.at
<DT>(LegionOrdered{replica_idx, i}); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 4920696756..f091a69b71 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -27,12 +27,13 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get
<DT>(), - input.get
<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get
<DT>(), + input.get
<DT>(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(DT).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; @@ -41,7 +42,8 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.num_elements().int_from_positive_int(); + size_t num_elements = + output_grad.shape.num_elements().int_from_positive_int(); add_kernel<real_type_t<DT>> <<<GET_BLOCKS(num_elements), CUDA_NUM_THREADS, 0, stream>>>( input_grad.get
<DT>(), output_grad.get<DT>
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index bee8f68eef..e251a57f8a 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -132,7 +132,8 @@ void forward_kernel(ffStream_t stream, stride = 1; } - coord_t output_dim_size = output.shape.at(m.legion_dim).int_from_positive_int(); + coord_t output_dim_size = + output.shape.at(m.legion_dim).int_from_positive_int(); coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); assert(index.data_type == DataType::INT32 || diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index e4a83a12c8..94690a74fb 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -27,12 +27,13 @@ struct ForwardKernel { RepartitionPerDeviceState const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get(), + input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index ac3b7c9b08..93400d333f 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -57,12 +57,13 @@ struct BackwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - checkCUDA(cudaMemcpyAsync(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(input.get(), + output.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 23e65cc1f3..9f532c96b1 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -38,12 +38,13 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync((void *)output.get(), - (void *)input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync((void *)output.get(), + (void *)input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 06aa8d74b2..3f0d6bb15a 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -31,12 +31,13 @@ struct ForwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - checkCUDA(cudaMemcpyAsync(output.get(), - input.get(), - 
input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA( + cudaMemcpyAsync(output.get(), + input.get(), + input.shape.num_elements().int_from_positive_int() * + size_of_datatype(T).int_from_positive_int(), + cudaMemcpyDeviceToDevice, + stream)); } }; diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 13162a9888..4e3c69eedf 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -76,8 +76,8 @@ void forward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = - input.shape.at(legion_dim_t{nonnegative_int{i}}).int_from_positive_int(); + int in_dim_size = input.shape.at(legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index e1ab7eb92c..2fce3c5db9 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -168,17 +168,17 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, #ifdef FF_USE_NCCL __host__ void adam_nccl_update_task_gpu(ffStream_t stream, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &handle, - float const *w_grad_ptr, - size_t size, - float *w_ptr, - float *v_ptr, - float *m_ptr) { + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *w_grad_ptr, + size_t size, + float *w_ptr, + float *v_ptr, + float *m_ptr) { // Step 1: Use NCCL to sync gradients checkNCCL(ncclAllReduce(w_grad_ptr, (float *)w_grad_ptr, diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 46137c3c9c..5a1881eb66 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -266,14 +266,13 @@ std::vector return get(a); } - GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { return GenericTensorAccessorR{ - writable.data_type, - writable.shape, - writable.ptr, - writable.device_type, + writable.data_type, + writable.shape, + writable.ptr, + writable.device_type, }; } @@ -311,7 +310,7 @@ std::pair return std::make_pair(accessor.shape, accessor.data_type); } -template - int32_t accessor_get_only_value(GenericTensorAccessorR const &); +template int32_t + accessor_get_only_value(GenericTensorAccessorR const &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/kernels/src/kernels/array_coord.cc index 60bb19351c..0927cb9951 100644 --- a/lib/kernels/src/kernels/array_coord.cc +++ b/lib/kernels/src/kernels/array_coord.cc @@ -5,8 +5,9 @@ namespace FlexFlow { -ArrayCoord array_coord_drop_dims(ArrayCoord const &coord, - std::function const &should_drop_dim) { +ArrayCoord array_coord_drop_dims( + ArrayCoord const &coord, + std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(coord.ff_ordered)) { if (!should_drop_dim(idx)) { diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc index 18b8861164..a1fb9bf09b 100644 --- a/lib/kernels/src/kernels/array_shape.cc +++ 
b/lib/kernels/src/kernels/array_shape.cc @@ -1,6 +1,7 @@ #include "kernels/array_shape.h" #include "kernels/legion_ordered/slice.h" #include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/ff_ordered/slice.h" #include "utils/containers/cartesian_product.h" #include "utils/containers/product.h" @@ -10,9 +11,8 @@ #include "utils/containers/vector_of.h" #include "utils/hash/tuple.h" #include "utils/hash/vector.h" -#include "utils/nonnegative_int/num_elements.h" -#include "op-attrs/ff_ordered/get_idxs.h" #include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -109,11 +109,11 @@ std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { } std::unordered_set get_array_coord_set(ArrayShape const &shape) { - std::vector> per_dim_ranges = - transform(vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](positive_int dim_size) -> std::vector { - return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); - }); + std::vector> per_dim_ranges = transform( + vector_of(ff_ordered_from_legion_ordered(shape.dims)), + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); + }); std::unordered_set> raw_points = unordered_set_of(cartesian_product(per_dim_ranges)); @@ -124,8 +124,9 @@ std::unordered_set get_array_coord_set(ArrayShape const &shape) { }); } -ArrayShape array_shape_drop_dims(ArrayShape const &shape, - std::function const &should_drop_dim) { +ArrayShape array_shape_drop_dims( + ArrayShape const &shape, + std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { if (!should_drop_dim(idx)) { diff --git a/lib/kernels/src/kernels/compare_tensor_accessors.cc b/lib/kernels/src/kernels/compare_tensor_accessors.cc index b1f5fd39b7..9fa9865c16 100644 --- a/lib/kernels/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/src/kernels/compare_tensor_accessors.cc @@ -3,60 +3,76 @@ namespace FlexFlow { -GenericTensorAccessorW compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l < r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_lt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l < r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l <= r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_le(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l <= r; }, + output_allocator); } - -GenericTensorAccessorW compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l > r; 
}, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_gt(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l > r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l >= r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_ge(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l >= r; }, + output_allocator); } -GenericTensorAccessorW compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l == r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_eq(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l == r; }, + output_allocator); } - -GenericTensorAccessorW compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - Allocator &output_allocator) { - return map_tensor_accessors2(lhs, rhs, - DataType::BOOL, - [](auto const &l, auto const &r) { return l != r; }, - output_allocator); +GenericTensorAccessorW + compare_tensor_accessors_ne(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](auto const &l, auto const &r) { return l != r; }, + output_allocator); } } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/create_accessor_with_contents.cc b/lib/kernels/src/kernels/create_accessor_with_contents.cc index f8b85baa4a..32b61926bd 100644 --- a/lib/kernels/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/src/kernels/create_accessor_with_contents.cc @@ -2,43 +2,32 @@ namespace FlexFlow { -template - GenericTensorAccessorW - create_1d_accessor_w_with_contents(std::vector const &, +template GenericTensorAccessorW + create_1d_accessor_w_with_contents(std::vector const &, Allocator &); + +template GenericTensorAccessorW + create_2d_accessor_w_with_contents(std::vector> const &, Allocator &); -template - GenericTensorAccessorW create_2d_accessor_w_with_contents( - std::vector> const &, Allocator &); +template GenericTensorAccessorW create_3d_accessor_w_with_contents( + std::vector>> const &, Allocator &); -template - GenericTensorAccessorW create_3d_accessor_w_with_contents( - std::vector>> const &, - Allocator &); +template GenericTensorAccessorW create_4d_accessor_w_with_contents( + std::vector>>> const &, + Allocator &); -template - GenericTensorAccessorW create_4d_accessor_w_with_contents( - std::vector>>> const &, - Allocator &); +template GenericTensorAccessorR + create_1d_accessor_r_with_contents(std::vector const &, Allocator &); -template - GenericTensorAccessorR - create_1d_accessor_r_with_contents(std::vector const &, +template GenericTensorAccessorR + 
create_2d_accessor_r_with_contents(std::vector> const &, Allocator &); -template - GenericTensorAccessorR create_2d_accessor_r_with_contents( - std::vector> const &, Allocator &); - -template - GenericTensorAccessorR create_3d_accessor_r_with_contents( - std::vector>> const &, - Allocator &); - -template - GenericTensorAccessorR create_4d_accessor_r_with_contents( - std::vector>>> const &, - Allocator &); +template GenericTensorAccessorR create_3d_accessor_r_with_contents( + std::vector>> const &, Allocator &); +template GenericTensorAccessorR create_4d_accessor_r_with_contents( + std::vector>>> const &, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc index f173bd0860..bee8d12556 100644 --- a/lib/kernels/src/kernels/fill_tensor_accessor.cc +++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc @@ -6,21 +6,17 @@ namespace FlexFlow { void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) { ASSERT(accessor.device_type == DeviceType::CPU); ASSERT(accessor.data_type == get_data_type_of_data_type_value(val)); - } -GenericTensorAccessorW create_accessor_w_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator) { +GenericTensorAccessorW create_accessor_w_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator) { NOT_IMPLEMENTED(); } -GenericTensorAccessorR create_accessor_r_filled_with(TensorShape const &shape, - DataTypeValue val, - Allocator const &allocator) { +GenericTensorAccessorR create_accessor_r_filled_with( + TensorShape const &shape, DataTypeValue val, Allocator const &allocator) { return read_only_accessor_from_write_accessor( - create_accessor_w_filled_with(shape, val, allocator)); + create_accessor_w_filled_with(shape, val, allocator)); } - } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index 3d24483967..ed54b21cfd 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -3,8 +3,8 @@ #include "kernels/datatype_dispatch.h" #include "kernels/local_cpu_allocator.h" #include "utils/indent.h" -#include #include "utils/nonnegative_int/nonnegative_range.h" +#include namespace FlexFlow { @@ -19,12 +19,12 @@ struct Print1DCPUAccessorR { positive_int ncols = accessor.shape.at(ff_dim_t{0_n}); stream << "[" - << join_strings(nonnegative_range(ncols.nonnegative_int_from_positive_int()), - " ", - [&](nonnegative_int col_idx) -> std::string { - return fmt::to_string( - accessor.at
<DT>(FFOrdered{col_idx})); - }) + << join_strings( + nonnegative_range(ncols.nonnegative_int_from_positive_int()), + " ", + [&](nonnegative_int col_idx) -> std::string { + return fmt::to_string(accessor.at<DT>
(FFOrdered{col_idx})); + }) << "]"; } }; @@ -51,7 +51,8 @@ struct Print2DCPUAccessorR { auto render_1d = [&](nonnegative_int dim0_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), + join_strings(nonnegative_range( + dim1_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim1_idx) -> std::string { return fmt::to_string( @@ -61,8 +62,11 @@ struct Print2DCPUAccessorR { }; stream << "[\n" - << indent( - join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_1d)) + << indent(join_strings( + nonnegative_range( + dim0_size.nonnegative_int_from_positive_int()), + "\n", + render_1d)) << "\n]"; } }; @@ -92,7 +96,8 @@ struct Print3DCPUAccessorR { auto render_1d = [&](nonnegative_int dim0_idx, nonnegative_int dim1_idx) -> std::string { return "[" + - join_strings(nonnegative_range(dim2_size.nonnegative_int_from_positive_int()), + join_strings(nonnegative_range( + dim2_size.nonnegative_int_from_positive_int()), " ", [&](nonnegative_int dim2_idx) -> std::string { return fmt::to_string(accessor.at
( @@ -103,17 +108,22 @@ struct Print3DCPUAccessorR { auto render_2d = [&](nonnegative_int dim0_idx) -> std::string { return "[\n" + - indent(join_strings(nonnegative_range(dim1_size.nonnegative_int_from_positive_int()), - "\n", - [&](nonnegative_int dim1_idx) -> std::string { - return render_1d(dim0_idx, dim1_idx); - })) + + indent(join_strings( + nonnegative_range( + dim1_size.nonnegative_int_from_positive_int()), + "\n", + [&](nonnegative_int dim1_idx) -> std::string { + return render_1d(dim0_idx, dim1_idx); + })) + "\n]"; }; stream << "[\n" - << indent( - join_strings(nonnegative_range(dim0_size.nonnegative_int_from_positive_int()), "\n", render_2d)) + << indent(join_strings( + nonnegative_range( + dim0_size.nonnegative_int_from_positive_int()), + "\n", + render_2d)) << "\n]"; } }; diff --git a/lib/kernels/src/kernels/map_tensor_accessors.cc b/lib/kernels/src/kernels/map_tensor_accessors.cc index c59d2207d0..77200fcefb 100644 --- a/lib/kernels/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/src/kernels/map_tensor_accessors.cc @@ -4,24 +4,26 @@ namespace FlexFlow { struct F1 { template - float operator()(T const &t) const { NOT_IMPLEMENTED(); } + float operator()(T const &t) const { + NOT_IMPLEMENTED(); + } }; -template -GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &, - F1 &&, - Allocator &); +template GenericTensorAccessorW + map_tensor_accessor(GenericTensorAccessorR const &, F1 &&, Allocator &); struct F2 { template - float operator()(T1 const &lhs, T2 const &rhs) const { NOT_IMPLEMENTED(); } + float operator()(T1 const &lhs, T2 const &rhs) const { + NOT_IMPLEMENTED(); + } }; -template - GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &, - GenericTensorAccessorR const &, - DataType, - F2 &&, - Allocator &); +template GenericTensorAccessorW + map_tensor_accessors2(GenericTensorAccessorR const &, + GenericTensorAccessorR const &, + DataType, + F2 &&, + Allocator &); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/src/kernels/reduce_tensor_accessor.cc index b9c4cee085..b51306d0e8 100644 --- a/lib/kernels/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/src/kernels/reduce_tensor_accessor.cc @@ -4,14 +4,13 @@ namespace FlexFlow { using F = std::function; -template - GenericTensorAccessorW reduce_tensor_accessor_in_dims( - GenericTensorAccessorR const &, - std::unordered_set const &, - Allocator &, - F &&); +template GenericTensorAccessorW + reduce_tensor_accessor_in_dims(GenericTensorAccessorR const &, + std::unordered_set const &, + Allocator &, + F &&); -template - int32_t reduce_tensor_accessor_in_all_dims(GenericTensorAccessorR const &, F &&); +template int32_t reduce_tensor_accessor_in_all_dims( + GenericTensorAccessorR const &, F &&); } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc index baeb9fadc1..b11791d32c 100644 --- a/lib/kernels/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -8,20 +8,22 @@ bool tensor_accessor_all(GenericTensorAccessorR const &t) { ASSERT(t.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( - t, overload { - [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, - [](auto lhs, auto rhs) -> bool { PANIC(); }, - }); + t, + overload{ + [](bool lhs, bool rhs) -> bool { return lhs && rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); } bool 
tensor_accessor_any(GenericTensorAccessorR const &t) { ASSERT(t.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( - t, overload { - [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, - [](auto lhs, auto rhs) -> bool { PANIC(); }, - }); + t, + overload{ + [](bool lhs, bool rhs) -> bool { return lhs || rhs; }, + [](auto lhs, auto rhs) -> bool { PANIC(); }, + }); } } // namespace FlexFlow diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 7c619bb557..305a6c935c 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -5,7 +5,10 @@ namespace FlexFlow { ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( - int num_ranks, int my_rank, size_t workSpaceSize, bool allowTensorOpMathConversion) { + int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { this->handle = new PerDeviceFFHandle{}; this->handle->workSpaceSize = workSpaceSize; this->handle->allowTensorOpMathConversion = allowTensorOpMathConversion; @@ -48,24 +51,27 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } -ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { +ManagedPerDeviceFFHandle + initialize_single_gpu_handle(size_t workSpaceSize, + bool allowTensorOpMathConversion) { return ManagedPerDeviceFFHandle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/workSpaceSize, - /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, }; } -ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, - int my_rank, - size_t workSpaceSize, - bool allowTensorOpMathConversion) { +ManagedPerDeviceFFHandle + initialize_multi_gpu_handle(int num_ranks, + int my_rank, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { return ManagedPerDeviceFFHandle{ - /*num_ranks=*/num_ranks, - /*my_rank=*/my_rank, - /*workSpaceSize=*/workSpaceSize, - /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, + /*num_ranks=*/num_ranks, + /*my_rank=*/my_rank, + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion, }; } diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index be1e3832ff..b98b1745d5 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -41,8 +41,9 @@ TEST_SUITE(FF_TEST_SUITE) { }, cpu_allocator); - GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); TensorShape result_shape = TensorShape{ TensorDims{FFOrdered{3_p}}, diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 9e0f38c8d6..51025cd17b 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -1,9 +1,9 @@ #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/reverse_kernels_cpu.h" -#include "kernels/create_accessor_with_contents.h" -#include #include 
"test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; @@ -36,18 +36,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {3, 3, 6}, - {2, 1, 5}, + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, }, - { - {1, 3, 2}, - {4, 2, 1}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -60,18 +61,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {4, 2, 1}, - {1, 3, 2}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {2, 1, 5}, - {3, 3, 6}, + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -84,18 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {2, 3, 1}, - {1, 2, 4}, + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, }, - { - {6, 3, 3}, - {5, 1, 2}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -132,18 +135,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {3, 3, 6}, - {2, 1, 5}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {1, 3, 2}, - {4, 2, 1}, + { + {3, 3, 6}, + {2, 1, 5}, + }, + { + {1, 3, 2}, + {4, 2, 1}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -156,18 +160,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{1_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {4, 2, 1}, - {1, 3, 2}, + { + {4, 2, 1}, + {1, 3, 2}, + }, + { + {2, 1, 5}, + {3, 3, 6}, + }, }, - { - {2, 1, 5}, - {3, 3, 6}, - }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); @@ -180,18 +185,19 @@ TEST_SUITE(FF_TEST_SUITE) { /*axis=*/ff_dim_t{2_n}, }; - GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( - { - { - {2, 3, 1}, - {1, 2, 4}, - }, + GenericTensorAccessorR correct = + create_3d_accessor_r_with_contents( { - {6, 3, 3}, - {5, 1, 2}, + { + {2, 3, 1}, + {1, 2, 4}, + }, + { + {6, 3, 3}, + {5, 1, 2}, + }, }, - }, - cpu_allocator); + cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input, result, attrs); diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc index 1d08adb56a..a9ba8dea13 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -20,7 +20,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } - template struct CreateRandomFilledAccessorW { GenericTensorAccessorW operator()(TensorShape const &shape, @@ -82,10 +81,10 @@ struct FillWithZeros { 0, accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); } else { - checkCUDA(cudaMemset(accessor.ptr, - 0, - 
accessor.shape.num_elements().int_from_positive_int() * - sizeof(T))); + checkCUDA(cudaMemset( + accessor.ptr, + 0, + accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); } } }; diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 2f7e908e0b..45e83cc0c6 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,8 +1,8 @@ #include "kernels/accessor.h" #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include -#include "kernels/create_accessor_with_contents.h" using namespace ::FlexFlow; @@ -78,25 +78,25 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns the value if the accessor only contains one value") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - }, + { + {12}, + }, }, cpu_allocator); float result = accessor_get_only_value(input); - float correct = 12; + float correct = 12; CHECK(result == correct); } - - SUBCASE("throws an error if the requested type does not match the type in the accessor") { + SUBCASE("throws an error if the requested type does not match the type in " + "the accessor") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - }, + { + {12}, + }, }, cpu_allocator); @@ -106,10 +106,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("throws an error if the accessor contains multiple values") { GenericTensorAccessorR input = create_3d_accessor_r_with_contents( { - { - {12}, - {12}, - }, + { + {12}, + {12}, + }, }, cpu_allocator); diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/kernels/test/src/kernels/array_coord.cc index 128b746a87..bbb503caf1 100644 --- a/lib/kernels/test/src/kernels/array_coord.cc +++ b/lib/kernels/test/src/kernels/array_coord.cc @@ -1,29 +1,32 @@ -#include #include "kernels/array_coord.h" +#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("array_coord_drop_dims") { ArrayCoord coord = ArrayCoord{ - FFOrdered{3_n, 5_n, 0_n, 1_n}, + FFOrdered{3_n, 5_n, 0_n, 1_n}, }; SUBCASE("removes dims specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return d.value % 2_n == 0_n; }; + std::function should_drop_dim = [](ff_dim_t d) { + return d.value % 2_n == 0_n; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = ArrayCoord{ - FFOrdered{5_n, 1_n}, + FFOrdered{5_n, 1_n}, }; CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return false; }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return false; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = coord; @@ -31,9 +34,11 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("returns empty coord if all dimensions are specified to be dropped") { - std::function should_drop_dim - = [](ff_dim_t d) { return true; }; + SUBCASE( + "returns empty coord if all dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return true; + }; ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); ArrayCoord correct = ArrayCoord{FFOrdered{}}; diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc index 2665cdda36..b3ccbc688c 100644 --- 
a/lib/kernels/test/src/kernels/array_shape.cc +++ b/lib/kernels/test/src/kernels/array_shape.cc @@ -38,7 +38,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("array_shape_drop_dims") { ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 4_p, 3_p}, + LegionOrdered{2_p, 4_p, 3_p}, }; SUBCASE("removes dims specified to be dropped") { @@ -48,16 +48,15 @@ TEST_SUITE(FF_TEST_SUITE) { ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = ArrayShape{ - LegionOrdered{4_p}, + LegionOrdered{4_p}, }; CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return false; - }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = input; @@ -65,10 +64,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return false; - }; + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = input; @@ -76,10 +74,9 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("returns empty shape if all dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return true; - }; + SUBCASE( + "returns empty shape if all dimensions are specified to be dropped") { + auto should_drop_dim = [](ff_dim_t dim) -> bool { return true; }; ArrayShape result = array_shape_drop_dims(input, should_drop_dim); ArrayShape correct = ArrayShape{LegionOrdered{}}; diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 54706ad74e..85ffa91315 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,9 +1,9 @@ -#include "internal/test_utils.h" -#include #include "kernels/compare_tensor_accessors.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" +#include using namespace ::FlexFlow; @@ -13,41 +13,42 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( { - { - {1, 3, 2}, - {4, 2, 1}, - }, - { - {3, 3, 6}, - {2, 1, 5}, - }, + { + {1, 3, 2}, + {4, 2, 1}, + }, + { + {3, 3, 6}, + {2, 1, 5}, + }, }, cpu_allocator); GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( { - { - {2, 3, 3}, - {5, 1, 0}, - }, - { - {1, 5, 4}, - {2, 1, 5}, - }, + { + {2, 3, 3}, + {5, 1, 0}, + }, + { + {1, 5, 4}, + {2, 1, 5}, + }, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_lt(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_lt(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { - { - {true, false, true}, - {true, false, false}, - }, - { - {false, true, false}, - {false, false, false}, - }, + { + {true, false, true}, + {true, false, false}, + }, + { + {false, true, false}, + {false, false, false}, + }, }, cpu_allocator); @@ -60,35 +61,36 @@ 
TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_3d_accessor_r_with_contents( { - { - {4, 2, 1}, - }, - { - {2, 1, 5}, - }, + { + {4, 2, 1}, + }, + { + {2, 1, 5}, + }, }, cpu_allocator); GenericTensorAccessorR rhs = create_3d_accessor_r_with_contents( { - { - {5, 1, 0}, - }, - { - {2, 1, 5}, - }, + { + {5, 1, 0}, + }, + { + {2, 1, 5}, + }, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_le(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_le(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_3d_accessor_r_with_contents( { - { - {true, false, false}, - }, - { - {true, true, true}, - }, + { + {true, false, false}, + }, + { + {true, true, true}, + }, }, cpu_allocator); @@ -101,23 +103,24 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2, 1}, - {2, 1, 5}, + {4, 2, 1}, + {2, 1, 5}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 1, 0}, - {2, 1, 5}, + {5, 1, 0}, + {2, 1, 5}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_gt(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true, true}, - {false, false, false}, + {false, true, true}, + {false, false, false}, }, cpu_allocator); @@ -130,26 +133,27 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {2, 5}, - {1, 8}, + {4, 2}, + {2, 5}, + {1, 8}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 1}, - {3, 6}, - {1, 0}, + {5, 1}, + {3, 6}, + {1, 0}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_ge(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true}, - {false, false}, - {true, true}, + {false, true}, + {false, false}, + {true, true}, }, cpu_allocator); @@ -162,23 +166,24 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {1, 8}, + {4, 2}, + {1, 8}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 2}, - {1, 8}, + {5, 2}, + {1, 8}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_eq(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {false, true}, - {true, true}, + {false, true}, + {true, true}, }, cpu_allocator); @@ -191,26 +196,27 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR lhs = create_2d_accessor_r_with_contents( { - {4, 2}, - {1, 8}, - {1, 2}, + {4, 2}, + {1, 8}, + {1, 2}, }, cpu_allocator); GenericTensorAccessorR rhs = create_2d_accessor_r_with_contents( { - {5, 2}, - {1, 8}, - {2, 2}, + {5, 2}, + {1, 8}, + {2, 2}, }, cpu_allocator); - GenericTensorAccessorW result = compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); + GenericTensorAccessorW result = + compare_tensor_accessors_ne(lhs, rhs, cpu_allocator); GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( { - {true, false}, - {false, false}, - {true, false}, + {true, false}, + {false, false}, + {true, false}, }, cpu_allocator); diff --git 
a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc index a6cfdbc97f..69fa2728bf 100644 --- a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -1,5 +1,5 @@ -#include #include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -7,8 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_1d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); + GenericTensorAccessorW result = + create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); auto at = [&](nonnegative_int c) -> float { return result.at(FFOrdered{c}); @@ -23,13 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_2d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_2d_accessor_w_with_contents( - { - {1, 4, 2}, - {2, 2, 7}, - }, - cpu_allocator); + GenericTensorAccessorW result = create_2d_accessor_w_with_contents( + { + {1, 4, 2}, + {2, 2, 7}, + }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -46,23 +45,23 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_3d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_3d_accessor_w_with_contents( - { + GenericTensorAccessorW result = create_3d_accessor_w_with_contents( + { { - {1, 4}, - {2, 3}, - {7, 2}, + {1, 4}, + {2, 3}, + {7, 2}, }, { - {9, 3}, - {4, 5}, - {0, 2}, + {9, 3}, + {4, 5}, + {0, 2}, }, - }, - cpu_allocator); + }, + cpu_allocator); - auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { + auto at = + [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{s, r, c}); }; @@ -83,33 +82,35 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("create_4d_accessor_w_with_contents") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorW result - = create_4d_accessor_w_with_contents( - { + GenericTensorAccessorW result = create_4d_accessor_w_with_contents( + { { - { - {2, 3}, - {7, 2}, - }, - { - {4, 5}, - {0, 2}, - }, + { + {2, 3}, + {7, 2}, + }, + { + {4, 5}, + {0, 2}, + }, }, { - { - {9, 6}, - {1, 2}, - }, - { - {8, 7}, - {3, 8}, - }, + { + {9, 6}, + {1, 2}, + }, + { + {8, 7}, + {3, 8}, + }, }, - }, - cpu_allocator); + }, + cpu_allocator); - auto at = [&](nonnegative_int s1, nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { + auto at = [&](nonnegative_int s1, + nonnegative_int s2, + nonnegative_int r, + nonnegative_int c) -> float { return result.at(FFOrdered{s1, s2, r, c}); }; diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index a7f2bed5ba..f515f2495b 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,7 +1,7 @@ #include "kernels/format_accessor_contents.h" #include "internal/test_utils.h" -#include "kernels/local_cpu_allocator.h" #include "kernels/create_accessor_with_contents.h" +#include "kernels/local_cpu_allocator.h" #include using namespace ::FlexFlow; @@ -12,7 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("accessor is 1d") { 
GenericTensorAccessorR accessor = - create_1d_accessor_r_with_contents({1, 2, 3, 2}, cpu_allocator); + create_1d_accessor_r_with_contents({1, 2, 3, 2}, + cpu_allocator); std::string correct = "[1 2 3 2]"; @@ -22,13 +23,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("accessor is 2d") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 2, 3, 5}, - {4, 3, 3, 2}, - {1, 1, 5, 8}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 2, 3, 5}, + {4, 3, 3, 2}, + {1, 1, 5, 8}, + }, + cpu_allocator); std::string correct = "[\n" " [1 2 3 5]\n" @@ -42,25 +44,26 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("accessor is 3d") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 2, 3, 6}, - {4, 3, 3, 9}, - {1, 1, 5, 1}, - }, - { - {4, 1, 8, 7}, - {9, 4, 2, 4}, - {1, 0, 0, 6}, - }, + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( { - {2, 1, 1, 9}, - {1, 3, 6, 2}, - {1, 9, 8, 9}, + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, }, - }, - cpu_allocator); + cpu_allocator); std::string correct = "[\n" " [\n" diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc index fcc59b7935..60d7c76904 100644 --- a/lib/kernels/test/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -1,6 +1,6 @@ -#include #include "kernels/map_tensor_accessors.h" #include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -10,8 +10,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW accessor = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); @@ -28,19 +28,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(at(1_n, 1_n) == 2); CHECK(at(1_n, 2_n) == 6); } - + TEST_CASE("map_tensor_accessor") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorW input = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); SUBCASE("function is not type changing") { - GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) { return x + 1; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessor( + input, [](float x) { return x + 1; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -55,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("function is type changing") { - GenericTensorAccessorW result = map_tensor_accessor(input, [](float x) -> bool { return x > 2; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessor( + input, [](float x) -> bool { return x > 2; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { return result.at(FFOrdered{r, c}); @@ -75,21 +77,26 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW lhs = create_2d_accessor_w_with_contents( { - {1, 3, 2}, - {2, 1, 5}, + {1, 3, 2}, + {2, 1, 5}, }, cpu_allocator); SUBCASE("argument types are the same") { GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( { - {0, 2, 5}, - {3, 3, 8}, + {0, 2, 5}, + {3, 3, 8}, }, cpu_allocator); SUBCASE("function is not type changing") { - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::FLOAT, [](float l, float r) { return l + 2 * r; }, 
cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, + rhs, + DataType::FLOAT, + [](float l, float r) { return l + 2 * r; }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { return result.at(FFOrdered{r, c}); @@ -104,7 +111,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("function is type changing") { - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::BOOL, [](float l, float r) -> bool { return l > r; }, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, + rhs, + DataType::BOOL, + [](float l, float r) -> bool { return l > r; }, + cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { return result.at(FFOrdered{r, c}); @@ -122,19 +134,20 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("argument types are not the same") { GenericTensorAccessorW rhs = create_2d_accessor_w_with_contents( { - {true, false, true}, - {true, false, false}, + {true, false, true}, + {true, false, false}, }, cpu_allocator); auto func = [](float l, bool r) -> double { if (r) { - return (- l); + return (-l); } else { return l * 2; } }; - GenericTensorAccessorW result = map_tensor_accessors2(lhs, rhs, DataType::DOUBLE, func, cpu_allocator); + GenericTensorAccessorW result = map_tensor_accessors2( + lhs, rhs, DataType::DOUBLE, func, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> double { return result.at(FFOrdered{r, c}); diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index 0e69b3b937..a269cf4777 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,9 +1,9 @@ -#include #include "kernels/reduce_tensor_accessor.h" #include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" -#include "kernels/create_accessor_with_contents.h" +#include using namespace ::FlexFlow; @@ -11,56 +11,58 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("reduce_tensor_accessor_in_dims") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 3, 2}, - {2, 1, 5}, - }, - { - {4, 2, 1}, - {8, 3, 6}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); GenericTensorAccessorW result = reduce_tensor_accessor_in_dims( - accessor, - {ff_dim_t{0_n}, ff_dim_t{2_n}}, - cpu_allocator, - [](int32_t accum, int32_t x) { return x + accum; }); + accessor, + {ff_dim_t{0_n}, ff_dim_t{2_n}}, + cpu_allocator, + [](int32_t accum, int32_t x) { return x + accum; }); - GenericTensorAccessorW correct = create_1d_accessor_w_with_contents( - { - 1 + 3 + 2 + 4 + 2 + 1, - 2 + 1 + 5 + 8 + 3 + 6, - }, - cpu_allocator); + GenericTensorAccessorW correct = + create_1d_accessor_w_with_contents( + { + 1 + 3 + 2 + 4 + 2 + 1, + 2 + 1 + 5 + 8 + 3 + 6, + }, + cpu_allocator); CHECK_MESSAGE(accessors_are_equal(result, correct), check_kv("result =", format_accessor_w_contents(result)), check_kv("correct=", format_accessor_w_contents(correct))); } - TEST_CASE("reduce_tensor_accessor_in_all_dims") { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {1, 3, 2}, - {2, 
1, 5}, - }, - { - {4, 2, 1}, - {8, 3, 6}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {1, 3, 2}, + {2, 1, 5}, + }, + { + {4, 2, 1}, + {8, 3, 6}, + }, + }, + cpu_allocator); int32_t result = reduce_tensor_accessor_in_all_dims( - accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); + accessor, [](int32_t accum, int32_t elem) { return accum + elem; }); int32_t correct = 1 + 3 + 2 + 2 + 1 + 5 + 4 + 2 + 1 + 8 + 3 + 6; CHECK(result == correct); diff --git a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc index 744b875ee7..46f746161f 100644 --- a/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/test/src/kernels/tensor_accessor_reductions.cc @@ -1,7 +1,7 @@ -#include +#include "kernels/tensor_accessor_reductions.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" -#include "kernels/tensor_accessor_reductions.h" +#include using namespace ::FlexFlow; @@ -10,18 +10,19 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("returns false if any elements are false") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {true, true, true}, - {true, true, true}, - }, - { - {true, false, true}, - {true, true, true}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {true, true, true}, + {true, true, true}, + }, + { + {true, false, true}, + {true, true, true}, + }, + }, + cpu_allocator); bool result = tensor_accessor_all(accessor); bool correct = false; @@ -30,12 +31,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns true if all elements are true") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {true, true, true}, - {true, true, true}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {true, true, true}, + {true, true, true}, + }, + cpu_allocator); bool result = tensor_accessor_all(accessor); bool correct = true; @@ -44,12 +46,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("throw an error if the datatype is not bool") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 0, 1}, - {1, 1, 1}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); CHECK_THROWS(tensor_accessor_all(accessor)); } @@ -59,18 +62,19 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); SUBCASE("returns true if any elements are true") { - GenericTensorAccessorR accessor = create_3d_accessor_r_with_contents( - { - { - {false, false, false}, - {true, false, false}, - }, - { - {false, false, false}, - {false, false, false}, - }, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_3d_accessor_r_with_contents( + { + { + {false, false, false}, + {true, false, false}, + }, + { + {false, false, false}, + {false, false, false}, + }, + }, + cpu_allocator); bool result = tensor_accessor_any(accessor); bool correct = true; @@ -79,12 +83,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns false if all elements are false") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {false, false, false}, - {false, false, false}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + 
create_2d_accessor_r_with_contents( + { + {false, false, false}, + {false, false, false}, + }, + cpu_allocator); bool result = tensor_accessor_any(accessor); bool correct = false; @@ -93,12 +98,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("throw an error if the datatype is not bool") { - GenericTensorAccessorR accessor = create_2d_accessor_r_with_contents( - { - {1, 0, 1}, - {1, 1, 1}, - }, - cpu_allocator); + GenericTensorAccessorR accessor = + create_2d_accessor_r_with_contents( + { + {1, 0, 1}, + {1, 1, 1}, + }, + cpu_allocator); CHECK_THROWS(tensor_accessor_any(accessor)); } diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 3b024fdf55..f80c080f11 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -20,9 +20,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 4ca8811b9b..dd98a36094 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -16,9 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 00a26c3303..534901daf2 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 6ce415d48c..f3a2a8153d 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Call Combine Forward and Backward Kernels") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index b22add8905..397b5cdf90 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { ManagedPerDeviceFFHandle managed_handle = 
initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 1b224084f8..c4518293dd 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -21,9 +21,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 98896cca18..14930e280b 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -9,9 +9,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; TensorShape input_shape = TensorShape{ diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 52389ea0f5..365fd3fb81 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 4f3b701bba..3e63294e78 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -23,9 +23,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index 099536ce0d..ed2d8dc2b6 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -7,9 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedFFStream") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc 
b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc index 058622e5cb..bfe3c363e4 100644 --- a/lib/kernels/test/src/test_managed_per_device_ff_handle.cc +++ b/lib/kernels/test/src/test_managed_per_device_ff_handle.cc @@ -6,10 +6,10 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test ManagedPerDeviceFFHandle") { ManagedPerDeviceFFHandle base_handle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, }; PerDeviceFFHandle const *base_handle_ptr = &base_handle.raw_handle(); @@ -26,10 +26,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("move assignment operator") { SUBCASE("move assign to other") { ManagedPerDeviceFFHandle new_handle{ - /*num_ranks=*/1, - /*my_rank=*/0, - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true, + /*num_ranks=*/1, + /*my_rank=*/0, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true, }; new_handle = std::move(base_handle); CHECK(&new_handle.raw_handle() == base_handle_ptr); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 94ce8f4848..40a9eead53 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -8,9 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 7691daf7a6..a999311b81 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -24,31 +24,30 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - Pool2DPerDeviceState state = - Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), - /*activation=*/std::nullopt, - /*input_w=*/input_w.int_from_positive_int(), - /*input_h=*/input_h.int_from_positive_int(), - /*input_c=*/input_c.int_from_positive_int(), - /*input_n=*/input_n.int_from_positive_int(), - /*output_w=*/output_w.int_from_positive_int(), - /*output_h=*/output_h.int_from_positive_int(), - /*output_c=*/output_c.int_from_positive_int(), - /*output_n=*/output_n.int_from_positive_int(), - /*pad_h=*/pad_h.unwrap_nonnegative(), - /*pad_w=*/pad_w.unwrap_nonnegative(), - /*kernel_h=*/kernel_h.int_from_positive_int(), - /*kernel_w=*/kernel_w.int_from_positive_int(), - /*stride_h=*/stride_h.int_from_positive_int(), - /*stride_w=*/stride_w.int_from_positive_int(), - /*pool_type=*/pool_type); + Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel( + /*handle=*/managed_handle.raw_handle(), + /*activation=*/std::nullopt, + /*input_w=*/input_w.int_from_positive_int(), + /*input_h=*/input_h.int_from_positive_int(), + 
/*input_c=*/input_c.int_from_positive_int(), + /*input_n=*/input_n.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_n=*/output_n.int_from_positive_int(), + /*pad_h=*/pad_h.unwrap_nonnegative(), + /*pad_w=*/pad_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.int_from_positive_int(), + /*kernel_w=*/kernel_w.int_from_positive_int(), + /*stride_h=*/stride_h.int_from_positive_int(), + /*stride_w=*/stride_w.int_from_positive_int(), + /*pool_type=*/pool_type); TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}, diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 16b03d34d9..e2c4c36a71 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 95989776c1..5f58239a31 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -22,9 +22,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); @@ -47,16 +46,18 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad = create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - gpu_allocator); - - GenericTensorAccessorR correct = create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1, 2, 3}, + {4, 3, 3}, + {1, 3, 5}, + }, + gpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); GenericTensorAccessorW input_grad = gpu_allocator.allocate_tensor(input_shape); @@ -85,9 +86,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 8c851e877e..066db28a17 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -6,9 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, 
+ /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index b9f97bc5cd..6a0ad84a92 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); @@ -61,9 +60,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator gpu_allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index dc8cb276ab..bf10b5c633 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -13,9 +13,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { nonnegative_int channels = 100_n; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index d51d0e40f5..1c1c4d4d51 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -14,9 +14,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { coord_t num_blks = 1; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 06b5add3c7..8560d33e5b 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -13,9 +13,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index d95545d1cc..184bf0b559 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define 
_FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index d625088be4..c75d4414de 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,10 +16,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index 7b08036059..e4a9c78743 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "task-spec/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index d400b4f815..ffaeaf285f 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,8 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - array_shape_from_tensor_shape(tensor_attrs.at(tensor_guid).shape))) { + array_shape_from_tensor_shape( + tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -58,8 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_grad.first).shape); + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -100,8 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - array_shape_from_tensor_shape(tensor_attrs.at(tensor_to_optimizers.first).shape); + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 4b5ee0b782..3b1bb0fd2d 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,12 +1,12 @@ #include "local-execution/local_training_backing.h" 
#include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "task-spec/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 974e580b8e..c23159a85d 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -55,8 +55,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = - logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); @@ -70,29 +69,26 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = - logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .int_from_positive_int(); // TODO FIXME something seems wrong here, - // isn't the numerator guaranteed to be 1? - // <--- this is not the case because of the - // potential parallel dim + .int_from_positive_int(); // TODO FIXME something seems wrong + // here, isn't the numerator guaranteed + // to be 1? 
+ // <--- this is not the case because of + // the potential parallel dim } - ASSERT( - label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) .int_from_positive_int()); - ASSERT( - label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == - 1); + ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 0acc3d865d..ae3d97daa4 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,6 +1,6 @@ #include "local-execution/task_registry.h" -#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" namespace FlexFlow { diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 971b09356c..3242ca79ad 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,16 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 2494ff1943..8827e0269d 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -18,13 +18,12 @@ using namespace ::FlexFlow; -bool did_loss_decrease( - GenericTensorAccessorR const &first_epoch, - GenericTensorAccessorR const &last_epoch -) { +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - return tensor_accessor_all(compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } TEST_SUITE(FF_CUDA_TEST_SUITE) { @@ -32,9 +31,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - 
/*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -48,32 +46,28 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 1_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - }, - /*gradient_mapping=*/{}, - /*optimizer_mapping*/{}, + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, }; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); @@ -162,16 +156,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values.push_back( - copy_tensor_accessor_r( - model_training_instance.get_loss_tensor_accessor(), - cpu_allocator)); + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch GenericTensorAccessorR first_epoch_loss = loss_values.at(0); GenericTensorAccessorR last_epoch = loss_values.back(); - CHECK(did_loss_decrease( first_epoch_loss, last_epoch)); + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } } diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 71148d06c1..42b88aa6bc 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,9 +12,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index e817b6fd8e..5c11010e2a 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -18,8 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - 
FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index df787fcd6f..bba0bd28ce 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -94,12 +94,10 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 5a9347e37b..d741d4d8d4 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -18,9 +18,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -36,11 +35,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, - DataType::FLOAT}; + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = allocator.allocate_tensor(output_tensor_shape); @@ -58,12 +55,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index ea20eb0fa0..4bcfa7fe17 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,8 +1,8 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "task-spec/task_signature_impl.h" #include "pcg/computation_graph_builder.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 7a2650b447..0a0b99e61c 100644 --- 
a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "kernels/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,16 +38,13 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 6ffe002f22..54c64e6b6c 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -16,9 +16,8 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true - ); + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -31,12 +30,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index 5d56909fec..d1dd52b4da 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -143,17 +143,17 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { /*input=*/dense_input, /*mlp_layers=*/config.dense_arch_layer_sizes); - std::vector emb_outputs = transform( - zip(config.embedding_size, sparse_inputs), - [&](std::pair const &combined_pair) - -> tensor_guid_t { - return create_dlrm_sparse_embedding_network( - /*cgb=*/cgb, - /*config=*/config, - /*input=*/combined_pair.second, - /*input_dim=*/combined_pair.first, - /*output_dim=*/config.embedding_dim); - }); + std::vector emb_outputs = + transform(zip(config.embedding_size, sparse_inputs), + [&](std::pair const &combined_pair) + -> tensor_guid_t { + return create_dlrm_sparse_embedding_network( + /*cgb=*/cgb, + /*config=*/config, + /*input=*/combined_pair.second, + /*input_dim=*/combined_pair.first, + /*output_dim=*/config.embedding_dim); + }); tensor_guid_t interacted_features = 
create_dlrm_interact_features(
       /*cgb=*/cgb,
diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h
index 62f7ccd4f9..ad45dcb13c 100644
--- a/lib/op-attrs/include/op-attrs/datatype.h
+++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -13,58 +13,53 @@ template <DataType>
 struct data_type_enum_to_class;
 
 template <>
-struct data_type_enum_to_class<DataType::FLOAT>
-    : type_identity<float> {};
+struct data_type_enum_to_class<DataType::FLOAT> : type_identity<float> {};
 
 template <>
-struct data_type_enum_to_class<DataType::DOUBLE>
-    : type_identity<double> {};
+struct data_type_enum_to_class<DataType::DOUBLE> : type_identity<double> {};
 
 template <>
-struct data_type_enum_to_class<DataType::INT32>
-    : type_identity<int32_t> {};
+struct data_type_enum_to_class<DataType::INT32> : type_identity<int32_t> {};
 
 template <>
-struct data_type_enum_to_class<DataType::INT64>
-    : type_identity<int64_t> {};
+struct data_type_enum_to_class<DataType::INT64> : type_identity<int64_t> {};
 
 template <>
-struct data_type_enum_to_class<DataType::HALF>
-    : type_identity<half> {};
+struct data_type_enum_to_class<DataType::HALF> : type_identity<half> {};
 
 template <>
-struct data_type_enum_to_class<DataType::BOOL>
-    : type_identity<bool> {};
+struct data_type_enum_to_class<DataType::BOOL> : type_identity<bool> {};
 
 template <typename T>
 struct type_to_data_type_enum;
 
 template <>
 struct type_to_data_type_enum<float>
-  : std::integral_constant<DataType, DataType::FLOAT> {};
+    : std::integral_constant<DataType, DataType::FLOAT> {};
 
 template <>
 struct type_to_data_type_enum<double>
-  : std::integral_constant<DataType, DataType::DOUBLE> {};
+    : std::integral_constant<DataType, DataType::DOUBLE> {};
 
 template <>
 struct type_to_data_type_enum<int32_t>
-  : std::integral_constant<DataType, DataType::INT32> {};
+    : std::integral_constant<DataType, DataType::INT32> {};
 
 template <>
 struct type_to_data_type_enum<int64_t>
-  : std::integral_constant<DataType, DataType::INT64> {};
+    : std::integral_constant<DataType, DataType::INT64> {};
 
 template <>
 struct type_to_data_type_enum<half>
-  : std::integral_constant<DataType, DataType::HALF> {};
+    : std::integral_constant<DataType, DataType::HALF> {};
 
 template <>
 struct type_to_data_type_enum<bool>
-  : std::integral_constant<DataType, DataType::BOOL> {};
+    : std::integral_constant<DataType, DataType::BOOL> {};
 
 template <typename T>
-inline constexpr DataType type_to_data_type_enum_v = type_to_data_type_enum<T>::value;
+inline constexpr DataType type_to_data_type_enum_v =
+    type_to_data_type_enum<T>::value;
 
 template <DataType DT, typename T>
 typename data_type_enum_to_class<DT>
::type cast_to(T t) { diff --git a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h index f619f94e20..c5c967d5c2 100644 --- a/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h +++ b/lib/op-attrs/include/op-attrs/initializers/kaiming_initializer_mode.h @@ -14,7 +14,7 @@ namespace FlexFlow { * https://github.com/pytorch/pytorch/blob/bd019c0bb485904a99fb38589444b1461ab1e486/torch/nn/init.py#L345-L363 */ positive_int calculate_fan_for_mode(TensorDims const &dims, - KaimingInitializerMode mode); + KaimingInitializerMode mode); } // namespace FlexFlow diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h index bb374d98ee..435a962963 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h @@ -18,11 +18,11 @@ nonnegative_int num_shard_dims(ParallelTensorDims const &); ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &); ParallelTensorDims lift_to_parallel(TensorDims const &); -ParallelTensorDims lift_to_parallel_with_degrees( - TensorDims const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorDims + lift_to_parallel_with_degrees(TensorDims const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorDims lift_to_parallel_with_degrees(TensorDims const &, ParallelTensorDimDegrees const &); diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h index 96d9bfb06a..e366f99b8e 100644 --- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h @@ -17,8 +17,7 @@ ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t); -FFOrdered - ff_ordered_shard_degrees(ParallelTensorShape const &); +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &); std::optional try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t); @@ -26,11 +25,11 @@ std::optional ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &); ParallelTensorShape lift_to_parallel(TensorShape const &); -ParallelTensorShape lift_to_parallel_with_degrees( - TensorShape const &, - SumDegree const &, - DiscardCopyDegree const &, - FFOrdered const &shard_degrees); +ParallelTensorShape + lift_to_parallel_with_degrees(TensorShape const &, + SumDegree const &, + DiscardCopyDegree const &, + FFOrdered const &shard_degrees); ParallelTensorShape lift_to_parallel_with_degrees(TensorShape const &, ParallelTensorDimDegrees const &); diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h index 85cea57523..28c48620a9 100644 --- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h +++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h @@ -9,7 +9,7 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set(); positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &, - ReplicaType); + ReplicaType); std::unordered_set get_replica_dims(ReplicaParallelDimSet const &); diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc index dfb77dac5d..a4abde2cb4 100644 --- 
a/lib/op-attrs/src/op-attrs/datatype_value.cc
+++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -24,12 +24,12 @@ DataTypeValue make_bool_data_type_value(bool value) {
 }
 
 DataType get_data_type_of_data_type_value(DataTypeValue value) {
-  return value.visit(overload {
-    [](float) { return DataType::FLOAT; },
-    [](double) { return DataType::DOUBLE; },
-    [](int32_t) { return DataType::INT32; },
-    [](int64_t) { return DataType::INT64; },
-    [](bool) { return DataType::BOOL; },
+  return value.visit(overload{
+      [](float) { return DataType::FLOAT; },
+      [](double) { return DataType::DOUBLE; },
+      [](int32_t) { return DataType::INT32; },
+      [](int64_t) { return DataType::INT64; },
+      [](bool) { return DataType::BOOL; },
   });
 }
 
diff --git a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
index 789903dc66..aee2256036 100644
--- a/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
+++ b/lib/op-attrs/src/op-attrs/initializers/kaiming_initializer_mode.cc
@@ -4,7 +4,7 @@ namespace FlexFlow {
 
 positive_int calculate_fan_for_mode(TensorDims const &dims,
-                                     KaimingInitializerMode mode) {
+                                    KaimingInitializerMode mode) {
   positive_int num_input_fmaps = dim_at_idx(dims, relative_ff_dim_t{0});
   positive_int num_output_fmaps = dim_at_idx(dims, relative_ff_dim_t{1});
 
diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc
index c5678e7bde..5800f086ef 100644
--- a/lib/op-attrs/src/op-attrs/ops/attention.cc
+++ b/lib/op-attrs/src/op-attrs/ops/attention.cc
@@ -69,8 +69,7 @@ positive_int get_vSize(MultiHeadAttentionInputs const &inputs) {
   return inputs.value_size;
 }
 
-positive_int
-    get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.sequence_dim.size;
 }
 
@@ -78,8 +77,7 @@ positive_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) {
   return inputs.sequence_length;
 }
 
-positive_int
-    get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.sequence_dim.size; // FIXME -- assumes only prefill
 }
 
@@ -87,8 +85,7 @@ positive_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) {
   return inputs.sequence_length; // FIXME -- assumes only prefill
 }
 
-positive_int
-    get_num_samples(MultiHeadAttentionParallelInputs const &inputs) {
+positive_int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) {
   return inputs.batch_dim.size;
 }
 
diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
index d11a8aba10..33c4987233 100644
--- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
+++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc
@@ -152,10 +152,9 @@ tl::expected<ParallelTensorShape, std::string>
   ShardParallelDim output_p = p;
   positive_int output_discard_copy_degree = 1_p;
 
-  positive_int output_sum_degree = positive_int{
-    get_total_parallel_degree(input_lhs) /
-    (output_b.degree * output_n.degree * output_p.degree)
-  };
+  positive_int output_sum_degree =
+      positive_int{get_total_parallel_degree(input_lhs) /
+                   (output_b.degree * output_n.degree * output_p.degree)};
 
   ParallelTensorShape result = ParallelTensorShape{
       ParallelTensorDims{
diff --git a/lib/op-attrs/src/op-attrs/ops/combine.cc b/lib/op-attrs/src/op-attrs/ops/combine.cc
index c55bdc55bb..64e9316ea2 100644
--- a/lib/op-attrs/src/op-attrs/ops/combine.cc
+++
b/lib/op-attrs/src/op-attrs/ops/combine.cc @@ -44,10 +44,10 @@ tl::expected } ParallelTensorShape output = input; - relative_ff_dim_t combine_dim = relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); + relative_ff_dim_t combine_dim = + relative_ff_dim_t_from_ff_dim_t(attrs.combine_dim); shard_dim_at_idx(output, combine_dim).degree = positive_int{ - shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree - }; + shard_dim_at_idx(output, combine_dim).degree / attrs.combine_degree}; return output; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index b41d1ffc32..aed118dd62 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,8 +17,7 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = - enumerate(ff_ordered(s.dims)); + std::map dim_sizes = enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; }; diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index af4b6cd898..2ac90c1c9c 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -51,9 +51,9 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, } static positive_int calculate_output_size(positive_int input_size, - nonnegative_int padding_size, - positive_int kernel_size, - positive_int stride) { + nonnegative_int padding_size, + positive_int kernel_size, + positive_int stride) { int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); int kernel_size_raw = kernel_size.int_from_positive_int(); diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 578e9ce652..32791e81a9 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -209,8 +209,8 @@ tl::expected, std::string> get_initializers( InitializerAttrs projection_initializer = maybe_projection_initializer.value_or(projection_default_initializer); - positive_int fan_in = calculate_fan_for_mode( - projection_shape.dims, KaimingInitializerMode::FAN_IN); + positive_int fan_in = calculate_fan_for_mode(projection_shape.dims, + KaimingInitializerMode::FAN_IN); float bound = 1 / sqrtf(static_cast(fan_in.int_from_positive_int())); diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index c542d688b3..361216cce4 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -105,9 +105,9 @@ tl::expected } static positive_int calculate_output_size(positive_int input_size, - nonnegative_int padding_size, - positive_int kernel_size, - positive_int stride) { + nonnegative_int padding_size, + positive_int kernel_size, + positive_int stride) { int input_size_raw = input_size.int_from_positive_int(); int padding_raw = padding_size.unwrap_nonnegative(); int kernel_size_raw = kernel_size.int_from_positive_int(); diff --git a/lib/op-attrs/src/op-attrs/ops/reduction.cc b/lib/op-attrs/src/op-attrs/ops/reduction.cc index 007559a816..580d47b1e9 100644 --- a/lib/op-attrs/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/src/op-attrs/ops/reduction.cc @@ -29,10 +29,9 @@ tl::expected } ParallelTensorShape output_shape = input_shape; - + output_shape.dims.replica_dims.sum_degree.value = positive_int{ - output_shape.dims.replica_dims.sum_degree.value / 
attrs.reduction_degree - }; + output_shape.dims.replica_dims.sum_degree.value / attrs.reduction_degree}; return output_shape; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 8a96bc25ba..dd5230f5a4 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -19,8 +19,7 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered - ff_ordered_shard_degrees(ParallelTensorDims const &d) { +FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index ff6debee4f..1b8f6f1dfa 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -52,8 +52,7 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered - ff_ordered_shard_degrees(ParallelTensorShape const &s) { +FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -133,8 +132,8 @@ ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, [&](ReplicaType replica_type) { ReplicaParallelDimSet replicas = shape.dims.replica_dims; positive_int degree = (ReplicaType::SUM == replica_type - ? replicas.sum_degree.value - : replicas.discard_copy_degree.value); + ? replicas.sum_degree.value + : replicas.discard_copy_degree.value); return ParallelDim{ReplicaParallelDim{degree, replica_type}}; }}); } diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index 41fb988bf7..871a39f91f 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -8,7 +8,7 @@ ReplicaParallelDimSet empty_replica_parallel_dim_set() { } positive_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, - ReplicaType replica_type) { + ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: return s.sum_degree.value; diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index a99fe167c7..a4f8cd62fd 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -188,10 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_q) { return lift_to_parallel_with_degrees( - input_q, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_q}); + input_q, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, @@ -200,10 +197,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_k) { return lift_to_parallel_with_degrees( - input_k, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_k}); + input_k, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, @@ -212,10 +206,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_v) { return lift_to_parallel_with_degrees( - input_v, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_v}); + input_v, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, @@ -224,10 +215,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_seq_len, positive_int o_o) { return 
lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_seq_len, o_o}); + output, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_o}); }; auto make_w = [&](SumDegree o_sum, @@ -242,20 +230,14 @@ TEST_SUITE(FF_TEST_SUITE) { DiscardCopyDegree o_eq, positive_int o_in_proj_channel) { return lift_to_parallel_with_degrees( - input_bias, - o_sum, - o_eq, - FFOrdered{o_in_proj_channel}); + input_bias, o_sum, o_eq, FFOrdered{o_in_proj_channel}); }; auto make_output_bias = [&](SumDegree o_sum, DiscardCopyDegree o_eq, positive_int o_out_proj_channel) { return lift_to_parallel_with_degrees( - output_bias, - o_sum, - o_eq, - FFOrdered{o_out_proj_channel}); + output_bias, o_sum, o_eq, FFOrdered{o_out_proj_channel}); }; SUBCASE("data parallelism") { diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index eeba779dfe..128d077a05 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -37,10 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_features) { return lift_to_parallel_with_degrees( - input, - o_sum, - o_eq, - FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, @@ -48,10 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_outchannels) { return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_outchannels}); + output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); }; SumDegree sum_degree = SumDegree{2_p}; diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 07520e7cce..d8844d9b30 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -37,7 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; positive_int old_shard_degree = output.dims.shard_dims.at(dim).degree; - output.dims.shard_dims.at(dim).degree = positive_int{old_shard_degree / degree}; + output.dims.shard_dims.at(dim).degree = + positive_int{old_shard_degree / degree}; return output; }(); diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index ee1255161c..95fa7d67c7 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -145,8 +145,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{ - dim0_size, 14_p + 16_p + 18_p, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 14_p + 16_p + 18_p, dim2_size}}, DataType::FLOAT, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index 67b6bbadb8..56407c03f1 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -165,8 +165,7 @@ TEST_SUITE(FF_TEST_SUITE) { kernel, o_sum, o_eq, - FFOrdered{ - o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); + FFOrdered{o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; auto make_bias = [&](SumDegree o_sum, diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 7d43b45dd0..e7cc2d6420 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -77,10 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_batch, positive_int o_outchannels) { 
return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_outchannels}); + output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, @@ -88,10 +85,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_entries, positive_int o_outchannels) { return lift_to_parallel_with_degrees( - weights, - o_sum, - o_eq, - FFOrdered{o_entries, o_outchannels}); + weights, o_sum, o_eq, FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index 1ca936738b..61934fd1fe 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -131,10 +131,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_extra_dim, positive_int o_channel) { return lift_to_parallel_with_degrees( - input, - o_sum, - o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + input, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, @@ -143,10 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_extra_dim, positive_int o_channel) { return lift_to_parallel_with_degrees( - output, - o_sum, - o_eq, - FFOrdered{o_batch, o_extra_dim, o_channel}); + output, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, @@ -154,10 +148,7 @@ TEST_SUITE(FF_TEST_SUITE) { positive_int o_inchannel, positive_int o_outchannel) { return lift_to_parallel_with_degrees( - projection, - o_sum, - o_eq, - FFOrdered{o_inchannel, o_outchannel}); + projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel}); }; auto make_bias = [&](SumDegree o_sum, diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 9a27aafa5b..fcb772d187 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -16,8 +16,8 @@ TEST_SUITE(FF_TEST_SUITE) { Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; - TensorDims input_dims = TensorDims{ - FFOrdered{input_n, input_c, input_h, input_w}}; + TensorDims input_dims = + TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { positive_int output_h = 5_p; diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index a480c840a3..7cfe205e36 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -35,7 +35,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = [&] { ParallelTensorShape output = input; positive_int old_sum_degree = output.dims.replica_dims.sum_degree.value; - output.dims.replica_dims.sum_degree.value = positive_int{old_sum_degree / degree}; + output.dims.replica_dims.sum_degree.value = + positive_int{old_sum_degree / degree}; return output; }(); diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 044b50fae2..7c559cf5a8 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -7,8 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { - TensorDims goal = - TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; + TensorDims goal = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; SUBCASE("dims match") { bool result = 
tensor_dims_is_broadcastable_to(goal, goal); @@ -27,8 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs dim expansion") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 1_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -46,8 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs invalid dim promotion") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 2_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -56,8 +53,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_dims(goal) < num_dims(curr)") { - TensorDims curr = - TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; + TensorDims curr = TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -72,8 +68,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims d2 = TensorDims{FFOrdered{10_p, 4_p, 1_p}}; SUBCASE("has target in inputs") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -83,8 +78,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has no possible target") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -94,8 +88,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has possible target, but not in inputs") { - TensorDims d3 = - TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; + TensorDims d3 = TensorDims{FFOrdered{1_p, 1_p, 1_p, 4_p, 3_p}}; TensorDims possible_target = TensorDims{FFOrdered{1_p, 1_p, 10_p, 4_p, 3_p}}; diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 863d9909c0..48c6e9a7a6 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -11,9 +11,9 @@ namespace FlexFlow { positive_int get_num_gpus(MachineSpecification const &ms); positive_int get_num_cpus(MachineSpecification const &ms); positive_int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type); + DeviceType const &device_type); positive_int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type); + DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index 08afa415af..3db949b99d 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -14,7 +14,7 @@ positive_int get_num_cpus(MachineSpecification const &ms) { } positive_int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type) { + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return get_num_gpus(ms); @@ -26,7 +26,7 @@ positive_int get_num_devices(MachineSpecification const &ms, } positive_int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type) { + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return ms.num_gpus_per_node; diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 3afa73ca62..0fbb021a55 
100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -105,8 +105,8 @@ std::optional get_machine_space_coordinate( return mv_strides.at(i.unwrap_nonnegative()).unwrapped; }); - std::vector coeffs = scanl( - sizes, 1_p, std::multiplies()); + std::vector coeffs = + scanl(sizes, 1_p, std::multiplies()); nonnegative_int index = start_idx; for (auto [coeff, coord_point, stride] : diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 36ad43f3d3..d612680de6 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -24,7 +24,8 @@ std::unordered_set std::vector> coordinate_ranges = transform(task.degrees, [&](positive_int num_points) { - return nonnegative_range(num_points.nonnegative_int_from_positive_int()); + return nonnegative_range( + num_points.nonnegative_int_from_positive_int()); }); std::unordered_set> raw_coordinates = diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 1568b73162..cf5a1e17f9 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,8 +33,7 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get( - OperatorAttributeKey::EMBED_DIM), + /*embed_dim=*/acc.get(OperatorAttributeKey::EMBED_DIM), /*num_heads=*/ acc.get(OperatorAttributeKey::NUM_HEADS), /*kdim=*/acc.get(OperatorAttributeKey::KDIM), diff --git a/lib/task-spec/include/task-spec/generic_task_impl_function.h b/lib/task-spec/include/task-spec/generic_task_impl_function.h index b02f4d6beb..31bf132e4f 100644 --- a/lib/task-spec/include/task-spec/generic_task_impl_function.h +++ b/lib/task-spec/include/task-spec/generic_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GENERIC_TASK_IMPL_FUNCTION_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h index f98e972df8..f82d249df1 100644 --- a/lib/task-spec/include/task-spec/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_INIT_OP_TASK_IMPL_FUNCTION_H -#include "task-spec/task_argument_accessor.h" #include "task-spec/device_specific_device_states.dtg.h" +#include "task-spec/task_argument_accessor.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h index 1424b09b84..e7d1a81760 100644 --- a/lib/task-spec/include/task-spec/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -2,9 +2,9 @@ #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "task-spec/privilege_tensor_accessor.h" #include 
"task-spec/concrete_arg.h" #include "task-spec/op_task_signature.h" +#include "task-spec/privilege_tensor_accessor.h" #include "task-spec/tensor_type.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/attention.h b/lib/task-spec/include/task-spec/ops/attention.h index 9b0179eeac..a8a444c9bf 100644 --- a/lib/task-spec/include/task-spec/ops/attention.h +++ b/lib/task-spec/include/task-spec/ops/attention.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ATTENTION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/attention.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/batch_matmul.h b/lib/task-spec/include/task-spec/ops/batch_matmul.h index e0dc01d3f1..a50d1889e1 100644 --- a/lib/task-spec/include/task-spec/ops/batch_matmul.h +++ b/lib/task-spec/include/task-spec/ops/batch_matmul.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_BATCH_MATMUL_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/batch_norm.h b/lib/task-spec/include/task-spec/ops/batch_norm.h index 081b60318f..bab6a4404a 100644 --- a/lib/task-spec/include/task-spec/ops/batch_norm.h +++ b/lib/task-spec/include/task-spec/ops/batch_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_BATCH_NORM_H #define _FLEXFLOW_BATCH_NORM_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/cast.h b/lib/task-spec/include/task-spec/ops/cast.h index 990624b0e3..dadc8f8c74 100644 --- a/lib/task-spec/include/task-spec/ops/cast.h +++ b/lib/task-spec/include/task-spec/ops/cast.h @@ -15,9 +15,9 @@ #ifndef _FLEXFLOW_CAST_H #define _FLEXFLOW_CAST_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h index be16379f36..ea7b3ed365 100644 --- a/lib/task-spec/include/task-spec/ops/combine.h +++ b/lib/task-spec/include/task-spec/ops/combine.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_COMBINE_H #define _FLEXFLOW_COMBINE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/combine_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/concat.h b/lib/task-spec/include/task-spec/ops/concat.h index 6c7adf76ea..4e7cfef629 100644 --- a/lib/task-spec/include/task-spec/ops/concat.h +++ b/lib/task-spec/include/task-spec/ops/concat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONCAT_H #define _FLEXFLOW_CONCAT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/concat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git 
a/lib/task-spec/include/task-spec/ops/conv_2d.h b/lib/task-spec/include/task-spec/ops/conv_2d.h index b7fda64961..1efb165d55 100644 --- a/lib/task-spec/include/task-spec/ops/conv_2d.h +++ b/lib/task-spec/include/task-spec/ops/conv_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_CONV_2D_H #define _FLEXFLOW_CONV_2D_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/conv_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/dropout.h b/lib/task-spec/include/task-spec/ops/dropout.h index 1801b63123..931e3e591e 100644 --- a/lib/task-spec/include/task-spec/ops/dropout.h +++ b/lib/task-spec/include/task-spec/ops/dropout.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_DROPOUT_H #define _FLEXFLOW_DROPOUT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/dropout_attrs.dtg.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/element_binary.h b/lib/task-spec/include/task-spec/ops/element_binary.h index 57af54522d..2bd8c5dde7 100644 --- a/lib/task-spec/include/task-spec/ops/element_binary.h +++ b/lib/task-spec/include/task-spec/ops/element_binary.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_ELEMENT_BINARY_H +#include "op-attrs/ops/element_binary_attrs.dtg.h" #include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_signature_impl.h" -#include "op-attrs/ops/element_binary_attrs.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/element_unary.h b/lib/task-spec/include/task-spec/ops/element_unary.h index f6dcd41455..5c88871ee7 100644 --- a/lib/task-spec/include/task-spec/ops/element_unary.h +++ b/lib/task-spec/include/task-spec/ops/element_unary.h @@ -1,9 +1,9 @@ #ifndef _ELEMENT_UNARY_H #define _ELEMENT_UNARY_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/element_unary_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/embedding.h b/lib/task-spec/include/task-spec/ops/embedding.h index 3a80d38398..27ade01cfa 100644 --- a/lib/task-spec/include/task-spec/ops/embedding.h +++ b/lib/task-spec/include/task-spec/ops/embedding.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_EMBEDDING_H #define _FLEXFLOW_EMBEDDING_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/flat.h b/lib/task-spec/include/task-spec/ops/flat.h index 6ac72ccd6b..3a02965d3b 100644 --- a/lib/task-spec/include/task-spec/ops/flat.h +++ b/lib/task-spec/include/task-spec/ops/flat.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_FLAT_H #define _FLEXFLOW_FLAT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/flat_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/gather.h b/lib/task-spec/include/task-spec/ops/gather.h index c5ccc4ccdb..f800173f20 100644 --- a/lib/task-spec/include/task-spec/ops/gather.h +++ b/lib/task-spec/include/task-spec/ops/gather.h @@ -1,9 +1,9 @@ 
#ifndef _FLEXFLOW_GATHER_H #define _FLEXFLOW_GATHER_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/gather_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/layer_norm.h b/lib/task-spec/include/task-spec/ops/layer_norm.h index 81af0c360f..ad418826f2 100644 --- a/lib/task-spec/include/task-spec/ops/layer_norm.h +++ b/lib/task-spec/include/task-spec/ops/layer_norm.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H #define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/layer_norm_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/linear.h b/lib/task-spec/include/task-spec/ops/linear.h index 69197fd627..d3c188a2c4 100644 --- a/lib/task-spec/include/task-spec/ops/linear.h +++ b/lib/task-spec/include/task-spec/ops/linear.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LINEAR_H #define _FLEXFLOW_LINEAR_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/linear_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/pool_2d.h b/lib/task-spec/include/task-spec/ops/pool_2d.h index a3601e8800..fbecd0e96f 100644 --- a/lib/task-spec/include/task-spec/ops/pool_2d.h +++ b/lib/task-spec/include/task-spec/ops/pool_2d.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_POOL_2D_H #define _FLEXFLOW_POOL_2D_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/pool_2d_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reduce.h b/lib/task-spec/include/task-spec/ops/reduce.h index e44c0f283f..ffcf66e752 100644 --- a/lib/task-spec/include/task-spec/ops/reduce.h +++ b/lib/task-spec/include/task-spec/ops/reduce.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H #define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduce_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h index cba90c37bb..5ddf292672 100644 --- a/lib/task-spec/include/task-spec/ops/reduction.h +++ b/lib/task-spec/include/task-spec/ops/reduction.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REDUCTION_H #define _FLEXFLOW_REDUCTION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reduction_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h index f43cf13179..dfc42c54e5 100644 --- a/lib/task-spec/include/task-spec/ops/repartition.h +++ b/lib/task-spec/include/task-spec/ops/repartition.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_PARTITION_H #define _FLEXFLOW_PARTITION_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/repartition_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/replicate.h 
b/lib/task-spec/include/task-spec/ops/replicate.h index 0086dad741..18f6f74b19 100644 --- a/lib/task-spec/include/task-spec/ops/replicate.h +++ b/lib/task-spec/include/task-spec/ops/replicate.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REPLICATE_H #define _FLEXFLOW_REPLICATE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/replicate_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h index f192d83b9a..29d29ae84c 100644 --- a/lib/task-spec/include/task-spec/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_RESHAPE_H #define _FLEXFLOW_RESHAPE_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reshape_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/reverse.h b/lib/task-spec/include/task-spec/ops/reverse.h index bb123b63f5..7c91f91c0b 100644 --- a/lib/task-spec/include/task-spec/ops/reverse.h +++ b/lib/task-spec/include/task-spec/ops/reverse.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REVERSE_H_ #define _FLEXFLOW_REVERSE_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/softmax.h b/lib/task-spec/include/task-spec/ops/softmax.h index 528dd5da0b..8f99c2658a 100644 --- a/lib/task-spec/include/task-spec/ops/softmax.h +++ b/lib/task-spec/include/task-spec/ops/softmax.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_SOFTMAX_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/softmax_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/split.h b/lib/task-spec/include/task-spec/ops/split.h index ed92f2925e..1aa8609011 100644 --- a/lib/task-spec/include/task-spec/ops/split.h +++ b/lib/task-spec/include/task-spec/ops/split.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_SPLIT_H #define _FLEXFLOW_SPLIT_H -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/split_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h index 8afe98d568..33f2dbc5d7 100644 --- a/lib/task-spec/include/task-spec/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TOPK_H_ #define _FLEXFLOW_TOPK_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/ops/transpose.h b/lib/task-spec/include/task-spec/ops/transpose.h index dec29f4b36..7762f440cd 100644 --- a/lib/task-spec/include/task-spec/ops/transpose.h +++ b/lib/task-spec/include/task-spec/ops/transpose.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_TRANSPOSE_H_ #define _FLEXFLOW_TRANSPOSE_H_ -#include "task-spec/task_impl_function.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" #include 
"task-spec/op_task_invocation.h" +#include "task-spec/task_impl_function.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h index 2cac3a5dd8..c1c42e09a3 100644 --- a/lib/task-spec/include/task-spec/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/itask_argument_accessor.h" #include "task-spec/device_specific.h" +#include "task-spec/itask_argument_accessor.h" #include "task-spec/per_device_op_state.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h index ee093c7d23..fcf9b346cf 100644 --- a/lib/task-spec/include/task-spec/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TASK_SIGNATURE_IMPL_H -#include "task-spec/task_signature_impl.dtg.h" #include "op-attrs/computation_graph_op_attrs.h" #include "task-spec/op_task_invocation.h" #include "task-spec/task_id_t.dtg.h" +#include "task-spec/task_signature_impl.dtg.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index 7e6c5062e2..8db2281bcf 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -129,7 +129,8 @@ static DeviceSpecificDeviceStates num_replicas *= input.shape.at(legion_dim_t{i}); } positive_int effective_num_elements = M; - positive_int effective_batch_size = positive_int{input.shape.num_elements() / M}; + positive_int effective_batch_size = + positive_int{input.shape.num_elements() / M}; LayerNormPerDeviceState per_device_state = init_kernel(handle, diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 3bf8080877..5e56ccdc1b 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,7 +1,7 @@ #include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" -#include "task-spec/task_argument_accessor.h" #include "op-attrs/ff_dim_t.h" +#include "task-spec/task_argument_accessor.h" #include "utils/exception.h" #include "utils/hash-utils.h" diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc index aa3184c999..145a9b58a3 100644 --- a/lib/task-spec/src/task-spec/ops/split.cc +++ b/lib/task-spec/src/task-spec/ops/split.cc @@ -48,7 +48,8 @@ static std::pair calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { positive_int num_blocks = 1_p; positive_int block_size = 1_p; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements().nonnegative_int_from_positive_int())) { + for (nonnegative_int d : nonnegative_range( + array_shape.num_elements().nonnegative_int_from_positive_int())) { if (d <= axis.value) { block_size *= array_shape.at(legion_dim_t{d}); } else { diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc index ea2d855bf6..bdf92d8487 100644 --- a/lib/task-spec/src/task-spec/ops/topk.cc +++ b/lib/task-spec/src/task-spec/ops/topk.cc @@ -104,7 +104,8 @@ static std::optional auto indices = 
acc.get_tensor(INDICES); positive_int length = input_grad.shape.at(legion_dim_t{0_n}); - positive_int batch_size = positive_int{input_grad.shape.num_elements() / length}; + positive_int batch_size = + positive_int{input_grad.shape.num_elements() / length}; return profile(backward_kernel, profiling,
diff --git a/lib/task-spec/test/src/task-spec/arg_ref.cc b/lib/task-spec/test/src/task-spec/arg_ref.cc index dcc2e9e580..5c331a1d71 100644 --- a/lib/task-spec/test/src/task-spec/arg_ref.cc +++ b/lib/task-spec/test/src/task-spec/arg_ref.cc @@ -1,22 +1,22 @@ -#include #include "task-spec/arg_ref.h" +#include #include using namespace ::FlexFlow; -enum class ExampleLabelType { +enum class ExampleLabelType { STRING, }; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ArgRefSpec::holds") { - ArgRefSpec arg_ref_spec = ArgRefSpec::create( - ArgRef{ExampleLabelType::STRING} - ); + ArgRefSpec arg_ref_spec = + ArgRefSpec::create( + ArgRef{ExampleLabelType::STRING}); SUBCASE("returns true if the type matches the ArgRef type") { bool result = arg_ref_spec.holds(); - bool correct = true; + bool correct = true; CHECK(result == correct); }
diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc index 70233e74d8..93d4d17148 100644 --- a/lib/utils/src/utils/positive_int/positive_int.cc +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -3,21 +3,16 @@ namespace FlexFlow { -positive_int::positive_int(int value) - : value_(value) -{ +positive_int::positive_int(int value) : value_(value) { this->check_invariant(); } -positive_int::positive_int(size_t value) - : value_(value) -{ +positive_int::positive_int(size_t value) : value_(value) { this->check_invariant(); } positive_int::positive_int(nonnegative_int value) - : value_(value.unwrap_nonnegative()) -{ + : value_(value.unwrap_nonnegative()) { this->check_invariant(); } @@ -196,7 +191,6 @@ nonnegative_int positive_int::operator*(nonnegative_int other) const { return other * *this; } - nonnegative_int operator*(nonnegative_int lhs, positive_int rhs) { return lhs * rhs.nonnegative_int_from_positive_int(); } @@ -247,10 +241,10 @@ void positive_int::check_invariant() const { } positive_int operator""_p(unsigned long long int x) { - ASSERT(x <= static_cast<unsigned long long int>(std::numeric_limits<int>::max())); + ASSERT(x <= + static_cast<unsigned long long int>(std::numeric_limits<int>::max())); return positive_int{static_cast<int>(x)}; - } } // namespace FlexFlow @@ -268,8 +262,7 @@ void adl_serializer<::FlexFlow::positive_int>::to_json( } // namespace nlohmann namespace rc { -Gen<::FlexFlow::positive_int> - Arbitrary<::FlexFlow::positive_int>::arbitrary() { +Gen<::FlexFlow::positive_int> Arbitrary<::FlexFlow::positive_int>::arbitrary() { return gen::construct<::FlexFlow::positive_int>(gen::positive<int>()); } } // namespace rc
diff --git a/lib/utils/test/src/utils/containers/sum.cc b/lib/utils/test/src/utils/containers/sum.cc index 2e335b1051..2beaee6526 100644 --- a/lib/utils/test/src/utils/containers/sum.cc +++ b/lib/utils/test/src/utils/containers/sum.cc @@ -1,7 +1,7 @@ #include "utils/containers/sum.h" +#include "utils/positive_int/positive_int.h" #include #include -#include "utils/positive_int/positive_int.h" using namespace ::FlexFlow; @@ -28,16 +28,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("sum(std::vector<positive_int>)") { SUBCASE("returns the sum if the input is not empty") { - std::vector<positive_int> input = {3_p, 9_p, 3_p}; + std::vector<positive_int> input = {3_p, 9_p, 3_p}; positive_int result = sum(input); positive_int correct = 15_p; CHECK(result == correct); - } + } - SUBCASE("throws an error if the
input is empty, as then 0 should be returned") { - std::vector input = {}; + SUBCASE( + "throws an error if the input is empty, as then 0 should be returned") { + std::vector input = {}; CHECK_THROWS(sum(input)); } diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc index 25348d34da..d35ea83aad 100644 --- a/lib/utils/test/src/utils/positive_int/positive_int.cc +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -1,5 +1,5 @@ -#include #include "utils/positive_int/positive_int.h" +#include using namespace ::FlexFlow; From cebd06cdb397b604c4bc9a0dc5a4ec0e92c16996 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 09:10:15 -0700 Subject: [PATCH 75/91] Merge branch 'master' into local-e2e-training --- .github/runs-on.yml | 12 + .github/workflows/tests.yml | 2 +- .../unlabelled/find_pattern_matches.cc | 34 +- .../unlabelled/pattern_matching.cc | 33 +- .../test/src/substitutions/pcg_pattern.cc | 394 ++++++++++++------ .../algorithms/get_subgraph.h | 12 + .../algorithms/get_subgraph.cc | 165 ++++---- .../graph/open_dataflow_graph/get_subgraph.cc | 349 ++++++++++++++++ 8 files changed, 769 insertions(+), 232 deletions(-) create mode 100644 lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 75038549ab..a4fff33536 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -12,3 +12,15 @@ images: arch: "x64" owner: "135269210855" # runs-on name: "runs-on-v2.2-ubuntu22-full-x64-20250220122045" + + official-ubuntu-ami: + platform: "linux" + arch: "x64" + ami: "ami-0a60b027285c0d4c5" + + flexflow-gpu-ci: + platform: "linux" + arch: "x64" + owner: "409719625166" # flexflow + name: "flexflow-gpu-ci" + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 861fcc1ea7..9d98fb07dd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ concurrency: jobs: cpu-ci: name: CPU unit tests and build - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 steps: - name: Checkout Git Repository diff --git a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc index a7ebc0bff7..9d8e4bc259 100644 --- a/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc +++ b/lib/substitutions/src/substitutions/unlabelled/find_pattern_matches.cc @@ -11,6 +11,7 @@ #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" +#include "utils/overload.h" namespace FlexFlow { @@ -67,6 +68,27 @@ static std::optional return match; } +MatchAdditionalCriterion additional_criterion_for_subpattern( + MatchAdditionalCriterion const &full_additional_criterion, + bidict const + &full_pattern_values_to_subpattern_inputs) { + return MatchAdditionalCriterion{ + full_additional_criterion.node_criterion, + [&](PatternValue const &patternValue, OpenDataflowValue const &pcgValue) { + return patternValue.visit( + overload{[&](PatternNodeOutput const &) -> bool { + return full_additional_criterion.value_criterion( + patternValue, pcgValue); + }, + [&](PatternInput const &i) -> bool { + PatternValue full_pattern_value = + full_pattern_values_to_subpattern_inputs.at_r(i); + return full_additional_criterion.value_criterion( + full_pattern_value, pcgValue); + }}); + }}; +} + std::vector find_pattern_matches(UnlabelledGraphPattern 
const &pattern, OpenDataflowGraphView const &graph, @@ -87,10 +109,18 @@ std::vector PatternSplitResult subpatterns = apply_split(pattern, split); std::vector prefix_matches = find_pattern_matches( - subpatterns.subpattern_1, graph, additional_criterion); + subpatterns.subpattern_1, + graph, + additional_criterion_for_subpattern( + additional_criterion, + subpatterns.full_pattern_values_to_subpattern_1_inputs)); std::vector postfix_matches = find_pattern_matches( - subpatterns.subpattern_2, graph, additional_criterion); + subpatterns.subpattern_2, + graph, + additional_criterion_for_subpattern( + additional_criterion, + subpatterns.full_pattern_values_to_subpattern_2_inputs)); for (UnlabelledDataflowGraphPatternMatch const &prefix_match : prefix_matches) { diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc index 304bb8cf46..c7b03e24f2 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_matching.cc @@ -7,10 +7,13 @@ #include "substitutions/unlabelled/unlabelled_graph_pattern.h" #include "utils/bidict/algorithms/left_entries.h" #include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/is_subseteq_of.h" #include "utils/containers/keys.h" #include "utils/containers/transform.h" +#include "utils/containers/values.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" @@ -18,6 +21,7 @@ #include "utils/graph/open_dataflow_graph/open_dataflow_edge.dtg.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge.h" #include "utils/overload.h" +#include #include namespace FlexFlow { @@ -46,8 +50,13 @@ struct SubgraphConcreteFromPattern { } OpenDataflowValue operator()(PatternInput const &i) const { - return OpenDataflowValue{full_graph_values_to_subgraph_inputs.at_l( - match.input_assignment.at(i))}; + OpenDataflowValue mapped_input = match.input_assignment.at(i); + if (full_graph_values_to_subgraph_inputs.contains_l(mapped_input)) { + return OpenDataflowValue{ + full_graph_values_to_subgraph_inputs.at_l(mapped_input)}; + } else { + return mapped_input; + } } OpenDataflowEdge operator()(InputPatternEdge const &e) const { @@ -148,11 +157,27 @@ bool unlabelled_pattern_does_match( UnlabelledDataflowGraphPatternMatch const &match, MatchAdditionalCriterion const &additional_criterion) { + std::unordered_set matched_by_pattern_inputs = + unordered_set_of(values(match.input_assignment)); + + ASSERT(left_entries(match.node_assignment) == get_nodes(pattern)); + ASSERT( + is_subseteq_of(right_entries(match.node_assignment), get_nodes(graph))); + ASSERT(keys(match.input_assignment) == get_graph_inputs(pattern)); + ASSERT(is_subseteq_of(matched_by_pattern_inputs, + get_open_dataflow_values(graph))); + OpenDataflowSubgraphResult subgraph_result = subgraph_matched(graph, match); OpenDataflowGraphView matched_subgraph = subgraph_result.graph; - assert(left_entries(match.node_assignment) == get_nodes(pattern)); - assert(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + std::unordered_set full_values_split_by_subgraph = + 
left_entries(subgraph_result.full_graph_values_to_subgraph_inputs); + + ASSERT(right_entries(match.node_assignment) == get_nodes(matched_subgraph)); + ASSERT(is_subseteq_of(full_values_split_by_subgraph, + get_open_dataflow_values(graph)), + full_values_split_by_subgraph, + get_open_dataflow_values(graph)); MatchAdditionalCriterion through_subgraph_operation = MatchAdditionalCriterion{ diff --git a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index 8ba1fee873..4dbf0885dd 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -13,144 +13,260 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_pattern_matches(PCGPattern, SubParallelComputationGraph)") { - ParallelComputationGraphBuilder builder; - - nonnegative_int batch_size = 16_n; - nonnegative_int batch_degree = 2_n; - nonnegative_int num_channels = 24_n; - - TensorShape a_shape = TensorShape{ - TensorDims{ - FFOrdered{ - batch_size, - num_channels, - }, - }, - DataType::FLOAT, - }; - - std::string a_name = "a"; - - parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); - a_tensor = - builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); - - nonnegative_int outDim = 16_n; - std::string x_matmul_name = "x_matmul"; - std::string y_matmul_name = "y_matmul"; - parallel_tensor_guid_t t0 = - builder.dense(a_tensor, - outDim, - /*activation=*/std::nullopt, - /*use_bias=*/false, - DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - x_matmul_name); - parallel_tensor_guid_t t1 = - builder.dense(a_tensor, - outDim, - /*activation=*/std::nullopt, - /*use_bias=*/false, - DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - y_matmul_name); - parallel_tensor_guid_t t2 = builder.add(t0, t1); - - ParallelComputationGraph pcg = builder.pcg; - parallel_layer_guid_t x_matmul = - get_parallel_layer_by_name(pcg, x_matmul_name); - parallel_layer_guid_t y_matmul = - get_parallel_layer_by_name(pcg, y_matmul_name); - std::vector x_incoming = - get_incoming_tensors(pcg, x_matmul); - REQUIRE(x_incoming.size() == 2); - parallel_tensor_guid_t x_weights = x_incoming.at(1); - std::vector y_incoming = - get_incoming_tensors(pcg, y_matmul); - REQUIRE(y_incoming.size() == 2); - parallel_tensor_guid_t y_weights = y_incoming.at(1); - - LabelledOpenDataflowGraph - g = LabelledOpenDataflowGraph:: - create>(); - - TensorAttributePattern pattern_tensor_a = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_b = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_c = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_x = - tensor_attribute_pattern_match_all(); - TensorAttributePattern pattern_tensor_y = - tensor_attribute_pattern_match_all(); - - OperatorAttributePattern op_pattern_1 = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::LINEAR), - }}; - - OperatorAttributePattern op_pattern_2 = op_pattern_1; - - DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); - DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); - DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); - - NodeAddedResult op_pattern_1_added = - g.add_node(op_pattern_1, - {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, - {pattern_tensor_x}); - PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; - 
OpenDataflowValue pt_x = - OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; - - NodeAddedResult op_pattern_2_added = - g.add_node(op_pattern_2, - {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_c}}, - {pattern_tensor_y}); - PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; - OpenDataflowValue pt_y = - OpenDataflowValue{get_only(op_pattern_2_added.outputs)}; - - PCGPattern pattern = PCGPattern{g}; - - std::unordered_set result = unordered_set_of( - find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); - - PCGPatternMatch match1 = - PCGPatternMatch{bidict{ - {op_pattern_1_node, x_matmul}, - {op_pattern_2_node, y_matmul}, - }, - bidict{ - {PatternInput{pt_a}, - open_parallel_tensor_guid_from_closed(a_tensor)}, - {PatternInput{pt_b}, - open_parallel_tensor_guid_from_closed(x_weights)}, - {PatternInput{pt_c}, - open_parallel_tensor_guid_from_closed(y_weights)}, - }}; - - PCGPatternMatch match2 = - PCGPatternMatch{bidict{ - {op_pattern_1_node, y_matmul}, - {op_pattern_2_node, x_matmul}, - }, - bidict{ - {PatternInput{pt_a}, - open_parallel_tensor_guid_from_closed(a_tensor)}, - {PatternInput{pt_b}, - open_parallel_tensor_guid_from_closed(y_weights)}, - {PatternInput{pt_c}, - open_parallel_tensor_guid_from_closed(x_weights)}, - }}; - - std::unordered_set correct = {match1, match2}; - - CHECK(result == correct); + SUBCASE("simple case") { + ParallelComputationGraphBuilder builder; + + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; + + TensorShape a_shape = TensorShape{ + TensorDims{ + FFOrdered{ + batch_size, + num_channels, + }, + }, + DataType::FLOAT, + }; + + std::string a_name = "a"; + + parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); + a_tensor = + builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); + + nonnegative_int outDim = 16_n; + std::string x_matmul_name = "x_matmul"; + std::string y_matmul_name = "y_matmul"; + parallel_tensor_guid_t t0 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + x_matmul_name); + parallel_tensor_guid_t t1 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + y_matmul_name); + parallel_tensor_guid_t t2 = builder.add(t0, t1); + + ParallelComputationGraph pcg = builder.pcg; + parallel_layer_guid_t x_matmul = + get_parallel_layer_by_name(pcg, x_matmul_name); + parallel_layer_guid_t y_matmul = + get_parallel_layer_by_name(pcg, y_matmul_name); + std::vector x_incoming = + get_incoming_tensors(pcg, x_matmul); + REQUIRE(x_incoming.size() == 2); + parallel_tensor_guid_t x_weights = x_incoming.at(1); + std::vector y_incoming = + get_incoming_tensors(pcg, y_matmul); + REQUIRE(y_incoming.size() == 2); + parallel_tensor_guid_t y_weights = y_incoming.at(1); + + LabelledOpenDataflowGraph + g = LabelledOpenDataflowGraph:: + create>(); + + TensorAttributePattern pattern_tensor_a = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_b = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_c = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_x = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_y = + tensor_attribute_pattern_match_all(); + + OperatorAttributePattern 
op_pattern_1 = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + }}; + + OperatorAttributePattern op_pattern_2 = op_pattern_1; + + DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); + DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); + DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); + + NodeAddedResult op_pattern_1_added = + g.add_node(op_pattern_1, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, + {pattern_tensor_x}); + PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; + OpenDataflowValue pt_x = + OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; + + NodeAddedResult op_pattern_2_added = + g.add_node(op_pattern_2, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_c}}, + {pattern_tensor_y}); + PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; + OpenDataflowValue pt_y = + OpenDataflowValue{get_only(op_pattern_2_added.outputs)}; + + PCGPattern pattern = PCGPattern{g}; + + std::unordered_set result = unordered_set_of( + find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); + + PCGPatternMatch match1 = PCGPatternMatch{ + bidict{ + {op_pattern_1_node, x_matmul}, + {op_pattern_2_node, y_matmul}, + }, + bidict{ + {PatternInput{pt_a}, + open_parallel_tensor_guid_from_closed(a_tensor)}, + {PatternInput{pt_b}, + open_parallel_tensor_guid_from_closed(x_weights)}, + {PatternInput{pt_c}, + open_parallel_tensor_guid_from_closed(y_weights)}, + }}; + + PCGPatternMatch match2 = PCGPatternMatch{ + bidict{ + {op_pattern_1_node, y_matmul}, + {op_pattern_2_node, x_matmul}, + }, + bidict{ + {PatternInput{pt_a}, + open_parallel_tensor_guid_from_closed(a_tensor)}, + {PatternInput{pt_b}, + open_parallel_tensor_guid_from_closed(y_weights)}, + {PatternInput{pt_c}, + open_parallel_tensor_guid_from_closed(x_weights)}, + }}; + + std::unordered_set correct = {match1, match2}; + + CHECK(result == correct); + } + + SUBCASE("pcg is a chain") { + ParallelComputationGraphBuilder builder; + + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; + + TensorShape a_shape = TensorShape{ + TensorDims{ + FFOrdered{ + batch_size, + num_channels, + }, + }, + DataType::FLOAT, + }; + + std::string a_name = "a"; + + parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape); + a_tensor = + builder.parallel_partition(a_tensor, ff_dim_t{0_n}, batch_degree); + + nonnegative_int outDim = 16_n; + std::string x_matmul_name = "x_matmul"; + std::string y_matmul_name = "y_matmul"; + parallel_tensor_guid_t t0 = + builder.dense(a_tensor, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + x_matmul_name); + parallel_tensor_guid_t t1 = + builder.dense(t0, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + y_matmul_name); + parallel_tensor_guid_t t2 = + builder.dense(t1, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + parallel_tensor_guid_t t3 = + builder.dense(t2, + outDim, + /*activation=*/std::nullopt, + /*use_bias=*/false, + DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt); + ParallelComputationGraph pcg = builder.pcg; + + LabelledOpenDataflowGraph + g = LabelledOpenDataflowGraph:: + create>(); + + 
TensorAttributePattern pattern_tensor_a = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_b = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_c = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_x = + tensor_attribute_pattern_match_all(); + TensorAttributePattern pattern_tensor_y = + tensor_attribute_pattern_match_all(); + + OperatorAttributePattern op_pattern_1 = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + }}; + + OperatorAttributePattern op_pattern_2 = op_pattern_1; + + DataflowGraphInput pt_a = g.add_input(pattern_tensor_a); + DataflowGraphInput pt_b = g.add_input(pattern_tensor_b); + DataflowGraphInput pt_c = g.add_input(pattern_tensor_c); + + NodeAddedResult op_pattern_1_added = + g.add_node(op_pattern_1, + {OpenDataflowValue{pt_a}, OpenDataflowValue{pt_b}}, + {pattern_tensor_x}); + PatternNode op_pattern_1_node = PatternNode{op_pattern_1_added.node}; + OpenDataflowValue pt_x = + OpenDataflowValue{get_only(op_pattern_1_added.outputs)}; + + NodeAddedResult op_pattern_2_added = + g.add_node(op_pattern_2, + {OpenDataflowValue{pt_x}, OpenDataflowValue{pt_c}}, + {pattern_tensor_y}); + PatternNode op_pattern_2_node = PatternNode{op_pattern_2_added.node}; + + PCGPattern pattern = PCGPattern{g}; + + std::unordered_set result = unordered_set_of( + find_pattern_matches(pattern, sub_pcg_from_full_pcg(pcg))); + + CHECK(result.size() == 3); + } } } diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h index 202058a3d1..f5bbbc228d 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/get_subgraph.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_GET_SUBGRAPH_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_GET_SUBGRAPH_H +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_data.dtg.h" #include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_subgraph_result.dtg.h" #include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" #include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" @@ -10,6 +11,17 @@ namespace FlexFlow { OpenDataflowSubgraphResult get_subgraph(OpenDataflowGraphView const &, std::unordered_set const &); +bidict + get_full_graph_values_to_subgraph_inputs( + OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes); + +OpenDataflowGraphData + get_subgraph_data(OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes, + bidict const + &full_graph_values_to_subgraph_inputs); + } // namespace FlexFlow #endif diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc index ad3d4f26c0..36f027f792 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph.cc @@ -4,7 +4,11 @@ #include "utils/containers/is_subseteq_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/values.h" +#include "utils/graph/dataflow_graph/dataflow_output_query.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/from_open_dataflow_graph_data.h" 
+#include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" #include "utils/graph/open_dataflow_graph/dataflow_graph_input_source.h" #include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" @@ -13,100 +17,89 @@ namespace FlexFlow { -struct OpenDataflowSubgraph final : public IOpenDataflowGraphView { - OpenDataflowSubgraph(OpenDataflowGraphView const &full_graph, - std::unordered_set const &subgraph_nodes, - bidict const - &full_graph_values_to_subgraph_inputs) - : full_graph(full_graph), subgraph_nodes(subgraph_nodes), - full_graph_values_to_subgraph_inputs( - full_graph_values_to_subgraph_inputs) { - assert(is_subseteq_of(this->subgraph_nodes, get_nodes(full_graph))); - } - - std::unordered_set query_nodes(NodeQuery const &q) const override { - return intersection(this->full_graph.query_nodes(q), this->subgraph_nodes); - } - - std::unordered_set - query_edges(OpenDataflowEdgeQuery const &q) const override { - std::unordered_set result; - for (OpenDataflowEdge const &open_e : this->full_graph.query_edges(q)) { - open_e.visit(overload{ - [&](DataflowEdge const &e) { - bool contains_src = contains(this->subgraph_nodes, e.src.node); - bool contains_dst = contains(this->subgraph_nodes, e.dst.node); - if (contains_src && contains_dst) { - result.insert(OpenDataflowEdge{e}); - } else if (contains_dst && !contains_src) { - result.insert(OpenDataflowEdge{DataflowInputEdge{ - this->full_graph_values_to_subgraph_inputs.at_l( - OpenDataflowValue{e.src}), - e.dst}}); - } - return std::nullopt; - }, - [&](DataflowInputEdge const &e) { - if (contains(this->subgraph_nodes, e.dst.node)) { - result.insert(OpenDataflowEdge{DataflowInputEdge{ - this->full_graph_values_to_subgraph_inputs.at_l( - OpenDataflowValue{e.src}), - e.dst}}); - } - return std::nullopt; - }}); - } - return result; - } - - std::unordered_set - query_outputs(DataflowOutputQuery const &q) const override { - return filter(this->full_graph.query_outputs(q), - [&](DataflowOutput const &o) { - return contains(this->subgraph_nodes, o.node); - }); - } - - std::unordered_set get_inputs() const override { - return unordered_set_of(values(this->full_graph_values_to_subgraph_inputs)); - }; - - OpenDataflowSubgraph *clone() const override { - return new OpenDataflowSubgraph{ - this->full_graph, - this->subgraph_nodes, - this->full_graph_values_to_subgraph_inputs, - }; - } - -private: - OpenDataflowGraphView full_graph; - std::unordered_set subgraph_nodes; - bidict - full_graph_values_to_subgraph_inputs; -}; - OpenDataflowSubgraphResult get_subgraph(OpenDataflowGraphView const &g, std::unordered_set const &subgraph_nodes) { - DataflowGraphInputSource input_source; bidict - full_graph_values_to_subgraph_inputs = generate_bidict( - get_subgraph_inputs(g, subgraph_nodes), - [&](OpenDataflowValue const &v) -> DataflowGraphInput { - return v.visit(overload{ - [](DataflowGraphInput const &i) { return i; }, - [&](DataflowOutput const &) { - return input_source.new_dataflow_graph_input(); - }, - }); - }); + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(g, subgraph_nodes); return OpenDataflowSubgraphResult{ - OpenDataflowGraphView::create( - g, subgraph_nodes, full_graph_values_to_subgraph_inputs), + OpenDataflowGraphView::create( + get_subgraph_data( + g, subgraph_nodes, full_graph_values_to_subgraph_inputs)), 
full_graph_values_to_subgraph_inputs, }; } +bidict + get_full_graph_values_to_subgraph_inputs( + OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes) { + DataflowGraphInputSource input_source; + return generate_bidict(get_subgraph_inputs(g, subgraph_nodes), + [&](OpenDataflowValue const &v) -> DataflowGraphInput { + return v.visit(overload{ + [](DataflowGraphInput const &i) { return i; }, + [&](DataflowOutput const &) { + return input_source.new_dataflow_graph_input(); + }, + }); + }); +} + +OpenDataflowGraphData + get_subgraph_data(OpenDataflowGraphView const &g, + std::unordered_set const &subgraph_nodes, + bidict const + &full_graph_values_to_subgraph_inputs) { + std::unordered_set subgraph_input_edges = + transform(get_subgraph_incoming_edges(g, subgraph_nodes), + [&](OpenDataflowEdge const &edge) { + return edge.visit( + overload{[&](DataflowInputEdge const &e) { + return OpenDataflowEdge{DataflowInputEdge{ + full_graph_values_to_subgraph_inputs.at_l( + OpenDataflowValue{e.src}), + e.dst}}; + }, + [&](DataflowEdge const &e) { + return OpenDataflowEdge{DataflowInputEdge{ + full_graph_values_to_subgraph_inputs.at_l( + OpenDataflowValue{e.src}), + e.dst}}; + }}); + }); + + OpenDataflowEdgeQuery subgraph_interior_edges_query = OpenDataflowEdgeQuery{ + DataflowInputEdgeQuery{ + query_set::match_none(), + query_set::match_none(), + query_set::match_none(), + }, + DataflowEdgeQuery{ + query_set{subgraph_nodes}, + query_set::matchall(), + query_set{subgraph_nodes}, + query_set::matchall(), + }, + }; + std::unordered_set subgraph_interior_edges = + g.query_edges(subgraph_interior_edges_query); + + std::unordered_set subgraph_inputs = + unordered_set_of(values(full_graph_values_to_subgraph_inputs)); + std::unordered_set subgraph_outputs = + filter(g.query_outputs(dataflow_output_query_all()), + [&](DataflowOutput const &o) { + return contains(subgraph_nodes, o.node); + }); + return OpenDataflowGraphData{ + subgraph_nodes, + set_union(subgraph_input_edges, subgraph_interior_edges), + subgraph_inputs, + subgraph_outputs, + }; +} + } // namespace FlexFlow diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc new file mode 100644 index 0000000000..c44e5f81b7 --- /dev/null +++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/get_subgraph.cc @@ -0,0 +1,349 @@ +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" +#include "utils/bidict/algorithms/left_entries.h" +#include "utils/containers/contains.h" +#include "utils/containers/get_only.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_full_graph_values_to_subgraph_inputs(OpenDataflowGraphView, " + "std::unordered_set) ") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + DataflowGraphInput i0 = graph.add_input(); + DataflowGraphInput i1 = graph.add_input(); + DataflowGraphInput i2 = graph.add_input(); + + NodeAddedResult n0_added = graph.add_node({OpenDataflowValue{i0}}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0, OpenDataflowValue{i1}}, 1_n); + Node n1 = n1_added.node; + OpenDataflowValue v1 = 
OpenDataflowValue{get_only(n1_added.outputs)}; + + NodeAddedResult n2_added = graph.add_node({v0}, 1_n); + Node n2 = n2_added.node; + OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; + + NodeAddedResult n3_added = + graph.add_node({OpenDataflowValue{i2}, v1, v2}, 1_n); + Node n3 = n3_added.node; + + std::unordered_set subgraph_nodes = {n1, n2, n3}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + SUBCASE("left entries are correct") { + std::unordered_set correct = { + v0, OpenDataflowValue{i1}, OpenDataflowValue{i2}}; + CHECK(left_entries(full_graph_values_to_subgraph_inputs) == correct); + } + + SUBCASE("mapping is correct") { + CHECK(full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{i1}) == + i1); + CHECK(full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{i2}) == + i2); + std::unordered_set inputs = {i1, i2}; + CHECK(!contains(inputs, full_graph_values_to_subgraph_inputs.at_l(v0))); + } + } + + TEST_CASE( + "get_subgraph_data(OpenDataflowGraphView, std::unordered_set, " + "bidict)") { + SUBCASE("2-node graph without inputs") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + NodeAddedResult n0_added = graph.add_node({}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); + Node n1 = n1_added.node; + + SUBCASE("subgraph is full graph") { + std::unordered_set subgraph_nodes = {n0, n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + {OpenDataflowEdge{ + DataflowEdge{DataflowOutput{n0, 0_n}, DataflowInput{n1, 0_n}}}}, + {}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is n0") { + std::unordered_set subgraph_nodes = {n0}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = OpenDataflowGraphData{subgraph_nodes, + {}, + {}, + {DataflowOutput{ + n0, + 0_n, + }}}; + CHECK(result == correct); + } + + SUBCASE("subgraph is n1") { + std::unordered_set subgraph_nodes = {n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + DataflowGraphInput n0_as_subgraph_input = + full_graph_values_to_subgraph_inputs.at_l(v0); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + {OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n1, 0_n}}}}, + {n0_as_subgraph_input}, + {DataflowOutput{ + n1, + 0_n, + }}}; + CHECK(result == correct); + } + + SUBCASE("subgraph is empty") { + std::unordered_set subgraph_nodes = {}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + OpenDataflowGraphData correct = + 
OpenDataflowGraphData{subgraph_nodes, {}, {}, {}}; + CHECK(result == correct); + } + } + + SUBCASE("3-node graph with inputs") { + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + DataflowGraphInput i0 = graph.add_input(); + DataflowGraphInput i1 = graph.add_input(); + + NodeAddedResult n0_added = graph.add_node({OpenDataflowValue{i0}}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = + graph.add_node({v0, OpenDataflowValue{i1}}, 1_n); + Node n1 = n1_added.node; + + NodeAddedResult n2_added = graph.add_node({v0}, 1_n); + Node n2 = n2_added.node; + + SUBCASE("subgraph is full graph") { + std::unordered_set subgraph_nodes = {n0, n1, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}}}, + OpenDataflowEdge{{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n2, 0_n}}}}, + }, + {i0, i1}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n0, n1) split") { + std::unordered_set subgraph_nodes = {n0, n1}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}}}, + }, + {i0, i1}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n1, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n0, n2) split") { + std::unordered_set subgraph_nodes = {n0, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i0, DataflowInput{n0, 0_n}}}, +
OpenDataflowEdge{DataflowEdge{DataflowOutput{n0, 0_n}, + DataflowInput{n2, 0_n}}}, + }, + {i0}, + { + DataflowOutput{ + n0, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + + SUBCASE("subgraph is (n1, n2) split") { + std::unordered_set subgraph_nodes = {n1, n2}; + + bidict + full_graph_values_to_subgraph_inputs = + get_full_graph_values_to_subgraph_inputs(graph, subgraph_nodes); + + OpenDataflowGraphData result = get_subgraph_data( + graph, subgraph_nodes, full_graph_values_to_subgraph_inputs); + + DataflowGraphInput n0_as_subgraph_input = + full_graph_values_to_subgraph_inputs.at_l(OpenDataflowValue{v0}); + + OpenDataflowGraphData correct = OpenDataflowGraphData{ + subgraph_nodes, + { + OpenDataflowEdge{DataflowInputEdge{i1, DataflowInput{n1, 1_n}}}, + OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n1, 0_n}}}, + OpenDataflowEdge{DataflowInputEdge{n0_as_subgraph_input, + DataflowInput{n2, 0_n}}}, + }, + {i1, n0_as_subgraph_input}, + { + DataflowOutput{ + n1, + 0_n, + }, + DataflowOutput{ + n2, + 0_n, + }, + }}; + CHECK(result == correct); + } + } + } +} From ea1a6dfb3d7d31aed39dc947348999d4447c5185 Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 23 May 2025 05:04:48 +0000 Subject: [PATCH 76/91] Add tests for positive_int --- .../include/utils/positive_int/positive_int.h | 2 +- .../src/utils/positive_int/positive_int.cc | 4 +- .../src/utils/positive_int/positive_int.cc | 479 +++++++++++++++++- 3 files changed, 480 insertions(+), 5 deletions(-) diff --git a/lib/utils/include/utils/positive_int/positive_int.h b/lib/utils/include/utils/positive_int/positive_int.h index 9ff0f4da64..6ddddadf50 100644 --- a/lib/utils/include/utils/positive_int/positive_int.h +++ b/lib/utils/include/utils/positive_int/positive_int.h @@ -69,7 +69,7 @@ struct positive_int { friend float &operator/=(float &lhs, positive_int rhs); nonnegative_int operator%(positive_int other) const; - nonnegative_int operator%(nonnegative_int other) const; + friend nonnegative_int operator%(nonnegative_int lhs, positive_int rhs); int int_from_positive_int() const; nonnegative_int nonnegative_int_from_positive_int() const; diff --git a/lib/utils/src/utils/positive_int/positive_int.cc b/lib/utils/src/utils/positive_int/positive_int.cc index 93d4d17148..3c4b0b4440 100644 --- a/lib/utils/src/utils/positive_int/positive_int.cc +++ b/lib/utils/src/utils/positive_int/positive_int.cc @@ -215,8 +215,8 @@ nonnegative_int positive_int::operator%(positive_int other) const { return nonnegative_int{this->value_ % other.value_}; } -nonnegative_int positive_int::operator%(nonnegative_int other) const { - return nonnegative_int{this->value_ % other.unwrap_nonnegative()}; +nonnegative_int operator%(nonnegative_int lhs, positive_int rhs) { + return nonnegative_int{lhs.unwrap_nonnegative() % rhs.value_}; } int positive_int::int_from_positive_int() const { diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc index d35ea83aad..77ecbf854d 100644 --- a/lib/utils/test/src/utils/positive_int/positive_int.cc +++ b/lib/utils/test/src/utils/positive_int/positive_int.cc @@ -1,10 +1,485 @@ #include "utils/positive_int/positive_int.h" #include +#include "test/utils/rapidcheck.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("positive_int") { - CHECK_MESSAGE(false, "TODO: positive_int"); + TEST_CASE("positive_int{int}") { + int x1 = 3; + int x2 = 4; + + int zero = 0; + int negative = -3; + + 
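// construction is expected to validate the argument: the positive values + // above wrap successfully, while zero and negative must throw (see the + // CHECK_THROWS cases below). +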
CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + CHECK_THROWS(positive_int{negative}); + } + + TEST_CASE("positive_int{size_t}") { + size_t x1 = 3; + size_t x2 = 4; + + size_t zero = 0; + + size_t maxint = static_cast(std::numeric_limits::max()); + size_t overflow1 = static_cast(std::numeric_limits::max()) + 1; + size_t overflow2 = static_cast(std::numeric_limits::max()) + 2; + + CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + CHECK(positive_int{maxint} == positive_int{maxint}); + CHECK_THROWS(positive_int{overflow1}); + CHECK_THROWS(positive_int{overflow2}); + } + + TEST_CASE("positive_int{nonnegative_int}") { + nonnegative_int x1 = 3_n; + nonnegative_int x2 = 4_n; + + nonnegative_int zero = 0_n; + + CHECK(positive_int{x1} == positive_int{x1}); + CHECK(positive_int{x2} != positive_int{x1}); + + CHECK_THROWS(positive_int{zero}); + } + + TEST_CASE("_p notation for positive_int") { + CHECK(9_p == positive_int{9}); + CHECK_THROWS(0_p); + } + + TEST_CASE("static_cast(positive_int)") { + CHECK(static_cast(8_p) == 8); + } + + TEST_CASE("static_cast(positive_int)") { + CHECK(static_cast(6_p) == 6); + } + + TEST_CASE("positive_int < positive_int") { + CHECK(4_p < 5_p); + CHECK_FALSE(7_p < 7_p); + CHECK_FALSE(3_p < 2_p); + } + + TEST_CASE("positive_int == positive_int") { + CHECK_FALSE(4_p == 5_p); + CHECK(7_p == 7_p); + CHECK_FALSE(3_p == 2_p); + } + + TEST_CASE("positive_int > positive_int") { + CHECK_FALSE(4_p > 5_p); + CHECK_FALSE(7_p > 7_p); + CHECK(3_p > 2_p); + } + + TEST_CASE("positive_int <= positive_int") { + CHECK(4_p <= 5_p); + CHECK(7_p <= 7_p); + CHECK_FALSE(3_p <= 2_p); + } + + TEST_CASE("positive_int != positive_int") { + CHECK(4_p != 5_p); + CHECK_FALSE(7_p != 7_p); + CHECK(3_p != 2_p); + } + + TEST_CASE("positive_int >= positive_int") { + CHECK_FALSE(4_p >= 5_p); + CHECK(7_p >= 7_p); + CHECK(3_p >= 2_p); + } + + TEST_CASE("positive_int < nonnegative_int") { + CHECK(4_p < 5_n); + CHECK_FALSE(7_p < 7_n); + CHECK_FALSE(3_p < 2_n); + CHECK_FALSE(1_p < 0_n); + } + + TEST_CASE("positive_int == nonnegative_int") { + CHECK_FALSE(4_p == 5_n); + CHECK(7_p == 7_n); + CHECK_FALSE(3_p == 2_n); + CHECK_FALSE(1_p == 0_n); + } + + TEST_CASE("positive_int > nonnegative_int") { + CHECK_FALSE(4_p > 5_n); + CHECK_FALSE(7_p > 7_n); + CHECK(3_p > 2_n); + CHECK(1_p > 0_n); + } + + TEST_CASE("positive_int <= nonnegative_int") { + CHECK(4_p <= 5_n); + CHECK(7_p <= 7_n); + CHECK_FALSE(3_p <= 2_n); + CHECK_FALSE(1_p <= 0_n); + } + + TEST_CASE("positive_int != nonnegative_int") { + CHECK(4_p != 5_n); + CHECK_FALSE(7_p != 7_n); + CHECK(3_p != 2_n); + CHECK(1_p != 0_n); + } + + TEST_CASE("positive_int >= nonnegative_int") { + CHECK_FALSE(4_p >= 5_n); + CHECK(7_p >= 7_n); + CHECK(3_p >= 2_n); + CHECK(1_p >= 0_n); + } + + TEST_CASE("nonnegative_int < positive_int") { + CHECK(4_n < 5_p); + CHECK_FALSE(7_n < 7_p); + CHECK_FALSE(3_n < 2_p); + CHECK(0_n < 1_p); + } + + TEST_CASE("nonnegative_int == positive_int") { + CHECK_FALSE(4_n == 5_p); + CHECK(7_n == 7_p); + CHECK_FALSE(3_n == 2_p); + CHECK_FALSE(0_n == 1_p); + } + + TEST_CASE("nonnegative_int > positive_int") { + CHECK_FALSE(4_n > 5_p); + CHECK_FALSE(7_n > 7_p); + CHECK(3_n > 2_p); + CHECK_FALSE(0_n > 1_p); + } + + TEST_CASE("nonnegative_int <= positive_int") { + CHECK(4_n <= 5_p); + CHECK(7_n <= 7_p); + CHECK_FALSE(3_n <= 2_p); + CHECK(0_n <= 1_p); + } + + TEST_CASE("nonnegative_int != 
positive_int") { + CHECK(4_n != 5_p); + CHECK_FALSE(7_n != 7_p); + CHECK(3_n != 2_p); + CHECK(0_n != 1_p); + } + + TEST_CASE("nonnegative_int >= positive_int") { + CHECK_FALSE(4_n >= 5_p); + CHECK(7_n >= 7_p); + CHECK(3_n >= 2_p); + CHECK_FALSE(0_n >= 1_p); + } + + TEST_CASE("positive_int < int") { + CHECK(4_p < 5); + CHECK_FALSE(7_p < 7); + CHECK_FALSE(3_p < 2); + CHECK_FALSE(1_p < -3); + } + + TEST_CASE("positive_int == int") { + CHECK_FALSE(4_p == 5); + CHECK(7_p == 7); + CHECK_FALSE(3_p == 2); + CHECK_FALSE(1_p == -3); + } + + TEST_CASE("positive_int > int") { + CHECK_FALSE(4_p > 5); + CHECK_FALSE(7_p > 7); + CHECK(3_p > 2); + CHECK(1_p > -3); + } + + TEST_CASE("positive_int <= int") { + CHECK(4_p <= 5); + CHECK(7_p <= 7); + CHECK_FALSE(3_p <= 2); + CHECK_FALSE(1_p <= -3); + } + + TEST_CASE("positive_int != int") { + CHECK(4_p != 5); + CHECK_FALSE(7_p != 7); + CHECK(3_p != 2); + CHECK(1_p != -3); + } + + TEST_CASE("positive_int >= int") { + CHECK_FALSE(4_p >= 5); + CHECK(7_p >= 7); + CHECK(3_p >= 2); + CHECK(1_p >= -3); + } + + TEST_CASE("int < positive_int") { + CHECK(4 < 5_p); + CHECK_FALSE(7 < 7_p); + CHECK_FALSE(3 < 2_p); + CHECK(-3 < 1_p); + } + + TEST_CASE("int == positive_int") { + CHECK_FALSE(4 == 5_p); + CHECK(7 == 7_p); + CHECK_FALSE(3 == 2_p); + CHECK_FALSE(-3 == 1_p); + } + + TEST_CASE("int > positive_int") { + CHECK_FALSE(4 > 5_p); + CHECK_FALSE(7 > 7_p); + CHECK(3 > 2_p); + CHECK_FALSE(-3 > 1_p); + } + + TEST_CASE("int <= positive_int") { + CHECK(4 <= 5_p); + CHECK(7 <= 7_p); + CHECK_FALSE(3 <= 2_p); + CHECK(-3 <= 1_p); + } + + TEST_CASE("int != positive_int") { + CHECK(4 != 5_p); + CHECK_FALSE(7 != 7_p); + CHECK(3 != 2_p); + CHECK(-3 != 1_p); + } + + TEST_CASE("int >= positive_int") { + CHECK_FALSE(4 >= 5_p); + CHECK(7 >= 7_p); + CHECK(3 >= 2_p); + CHECK_FALSE(-3 >= 1_p); + } + + TEST_CASE("positive_int + positive_int") { + CHECK(4_p + 2_p == 6_p); + } + + TEST_CASE("positive_int + nonnegative_int") { + CHECK(4_p + 3_n == 7_p); + } + + TEST_CASE("++positive_int") { + positive_int x = 3_p; + CHECK(++x == 4_p); + CHECK(x == 4_p); + } + + TEST_CASE("positive_int++") { + positive_int x = 3_p; + CHECK(x++ == 3_p); + CHECK(x == 4_p); + } + + TEST_CASE("positive_int += positive_int ") { + positive_int x = 3_p; + + SUBCASE("single application") { + CHECK((x += 2_p) == 5_p); + CHECK(x == 5_p); + } + + SUBCASE("repeated application") { + CHECK(((x += 2_p) += 4_p) == 9_p); + CHECK(x == 9_p); + } + } + + TEST_CASE("positive_int += nonnegative_int") { + positive_int x = 3_p; + + SUBCASE("rhs is positive") { + CHECK((x += 2_n) == 5_p); + CHECK(x == 5_p); + } + + SUBCASE("rhs is zero") { + CHECK((x += 0_n) == 3_p); + CHECK(x == 3_p); + } + + SUBCASE("repeated application") { + CHECK(((x += 2_n) += 4_n) == 9_p); + CHECK(x == 9_p); + } + } + + TEST_CASE("positive_int * positive_int") { + CHECK(3_p * 4_p == 12_p); + } + + TEST_CASE("positive_int *= positive_int") { + positive_int x = 5_p; + + SUBCASE("single application") { + CHECK((x *= 2_p) == 10_p); + CHECK(x == 10_p); + } + + SUBCASE("repeated application") { + CHECK(((x *= 2_p) *= 3_p) == 30_p); + CHECK(x == 30_p); + } + } + + TEST_CASE("positive_int * nonnegative_int") { + CHECK(3_p * 4_n == 12_n); + CHECK(3_p * 0_n == 0_n); + } + + TEST_CASE("positive_int / positive_int") { + CHECK(4_p / 2_p == 2_n); + CHECK(4_p / 3_p == 1_n); + CHECK(4_p / 4_p == 1_n); + CHECK(4_p / 5_p == 0_n); + } + + TEST_CASE("nonnegative_int / positive_int") { + CHECK(4_n / 2_p == 2_n); + CHECK(4_n / 3_p == 1_n); + CHECK(4_n / 4_p == 1_n); + CHECK(4_n / 5_p 
== 0_n); + + CHECK(0_n / 1_p == 0_n); + } + + TEST_CASE("float / positive_int") { + CHECK(4.0f / 2_p == 2.0f); + CHECK(3.0f / 2_p == 1.5f); + CHECK(-3.0f / 4_p == -0.75f); + CHECK(0.0f / 1_p == 0.0f); + } + + TEST_CASE("float /= positive_int") { + SUBCASE("divides evenly") { + float x = 4.0f; + CHECK((x /= 2_p) == 2.0f); + CHECK(x == 2.0f); + } + + SUBCASE("does not divide evenly") { + float x = 3.0f; + CHECK((x /= 2_p) == 1.5f); + CHECK(x == 1.5f); + } + + SUBCASE("numerator is negative") { + float x = -3.0f; + CHECK((x /= 4_p) == -0.75f); + CHECK(x == -0.75f); + } + + SUBCASE("numerator is zero") { + float x = 0.0f; + CHECK((x /= 4_p) == 0.0f); + CHECK(x == 0.0f); + } + + SUBCASE("repeated /=") { + float x = 20.0f; + CHECK(((x /= 4_p) /= 2_p) == 2.5f); + CHECK(x == 2.5f); + } + } + + TEST_CASE("positive_int % positive_int") { + CHECK(4_p % 3_p == 1_n); + CHECK(5_p % 5_p == 0_n); + } + + TEST_CASE("nonnegative_int % positive_int") { + CHECK(4_n % 3_p == 1_n); + CHECK(5_n % 5_p == 0_n); + CHECK(0_n % 3_p == 0_n); + } + + TEST_CASE("positive_int::int_from_positive_int()") { + CHECK((3_p).int_from_positive_int() == 3); + } + + TEST_CASE("positive_int::nonnegative_int_from_positive_int()") { + CHECK((4_p).nonnegative_int_from_positive_int() == 4); + } + + TEST_CASE("positive_int::operator<<(std::ostream &, positive_int)") { + std::ostringstream oss; + oss << 3_p; + + std::string result = oss.str(); + std::string correct = "3"; + + CHECK(result == correct); + } + + TEST_CASE("positive_int fmt support") { + std::string result = fmt::to_string(14_p); + std::string correct = "14"; + + CHECK(result == correct); + } + + TEST_CASE("adl_serializer") { + SUBCASE("to_json") { + positive_int input = 5_p; + + nlohmann::json result = input; + nlohmann::json correct = 5; + + CHECK(result == correct); + } + + SUBCASE("from_json") { + nlohmann::json input = 5; + + positive_int result = input.template get(); + positive_int correct = 5_p; + + CHECK(result == correct); + } + } + + TEST_CASE("std::hash") { + positive_int nn_int_1a = positive_int{1}; + positive_int nn_int_1b = positive_int{1}; + positive_int nn_int_2 = positive_int{2}; + std::hash hash_fn; + + SUBCASE("Identical values have the same hash") { + CHECK(hash_fn(nn_int_1a) == hash_fn(nn_int_1b)); + } + + SUBCASE("Different values have different hashes") { + CHECK(hash_fn(nn_int_1a) != hash_fn(nn_int_2)); + } + + SUBCASE("unordered_set works with positive_int") { + std::unordered_set<::FlexFlow::positive_int> positive_int_set; + positive_int_set.insert(nn_int_1a); + positive_int_set.insert(nn_int_1b); + positive_int_set.insert(nn_int_2); + + CHECK(positive_int_set.size() == 2); + } + } + + TEST_CASE("rc::Arbitrary") { + RC_SUBCASE([](positive_int) { }); } } From 9d4f90b54a0929b482aa33bd3032ee1026188063 Mon Sep 17 00:00:00 2001 From: fruitea Date: Mon, 26 May 2025 12:57:48 -0700 Subject: [PATCH 77/91] test: add realm backend e2e test --- .../local-execution/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/local-execution/test/src/test_e2e.cc | 140 +++++++++++++++++ .../test/src/test_local_cost_estimator.cc | 4 +- .../test/src/test_loss_functions.cc | 2 +- .../test/src/test_task_registry.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- .../realm-backend/model_training_instance.h | 1 + .../src/model_training_instance.cc | 10 ++ lib/realm-backend/test/src/test_e2e.cc | 145 ++++++++++++++++++ 10 files changed, 312 insertions(+), 5 deletions(-) create mode 100644
lib/local-execution/test/src/test_e2e.cc create mode 100644 lib/realm-backend/test/src/test_e2e.cc diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index b36b20ed04..54b76313ab 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,6 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d404221d88..f232011230 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,4 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc new file mode 100644 index 0000000000..05aaab0c88 --- /dev/null +++ b/lib/local-execution/test/src/test_e2e.cc @@ -0,0 +1,140 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "local-execution/local_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "local-execution/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("E2ETest") { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + + Allocator allocator = create_local_cuda_memory_allocator(); + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct 
computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + LocalTrainingBacking local_training_backing = + LocalTrainingBacking{allocator, + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + } +} \ No newline at end of file diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 30682c9a48..0fa841be20 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,8 +9,8 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Local Cost Estimator") { +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { // local backing initialization ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index 2bf138e204..ae76dcccf9 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -14,7 +14,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Loss Functions") { + TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git 
a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index dd4b6f5b44..16877b0e09 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -9,7 +9,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Task Registry") { + TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; nonnegative_int embed_dim = 32_n; diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 1f8684f38a..dcd9c025b3 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Execute Update") { + TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 6c92b1de4a..bc9c79dccf 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,6 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); + GenericTensorAccessorW get_loss_tensor_backing(); }; } // namespace FlexFlow diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 8ced02e95a..d420776e42 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -73,4 +73,14 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } +GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { + gradient_tensor_t loss_tensor = + this->training_backing.realm_tensor_backing + .tensor_gradient_mapping.at(this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.realm_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + return loss_tensor_backing; +} + } // namespace FlexFlow diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc new file mode 100644 index 0000000000..040b268128 --- /dev/null +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -0,0 +1,145 @@ +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "local-execution/allocated_tensors.h" +#include "realm-backend/realm_allocator.h" +#include "realm-backend/realm_training_backing.h" +#include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "test_utils.h" +#include "utils/containers/get_only.h" +#include "realm-backend/model_training_instance.h" +#include + +using namespace ::FlexFlow; + +bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { + float* first_epoch_ptr = first_epoch.get_float_ptr(); + float* last_epoch_ptr = last_epoch.get_float_ptr(); + + int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + for (int i = 0; i < batch_size; i++) { + if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + return false; + } + } + + return true; +} + +void top_level_task(const 
void *args, size_t arglen, const void *userdata, + size_t userlen, Realm::Processor p) { + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle{}; + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + assert(pq.count() > 0); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = + loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int output_dim = 32_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + { + {TensorTypeVariant{label_tensor}, + label_tensor_backing}}, + {}, + {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape = TensorShape{ + TensorDims{FFOrdered{data_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + { + printf("\nRunning test %d: E2ETest...\n", 1); + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + allocator, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 10; + std::vector loss_values (num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + loss_values[i] = model_training_instance.get_loss_tensor_backing(); + } + // Assert that each sample in the batch has a lower loss in last epoch than the first epoch + 
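// did_loss_decrease (defined at the top of this file) returns false if any + // sample's loss in the last epoch exceeds its loss in the first epoch. +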
CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + printf("passed\n"); + } + } +} \ No newline at end of file From f3e2a27555e8ad79727552363be75a45944de890 Mon Sep 17 00:00:00 2001 From: fruitea Date: Tue, 27 May 2025 11:19:44 -0700 Subject: [PATCH 78/91] tweak: minor --- lib/realm-backend/test/src/test_e2e.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index 040b268128..659b0e5977 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -1,4 +1,3 @@ -#include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" @@ -14,6 +13,7 @@ #include using namespace ::FlexFlow; +using namespace Realm; bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { float* first_epoch_ptr = first_epoch.get_float_ptr(); @@ -58,7 +58,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, DataType::FLOAT}; GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); + allocators[0].allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ { {TensorTypeVariant{label_tensor}, @@ -125,7 +125,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, optimizer_attrs); // begin training loop ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs }; int num_epochs = 10; From ba85fe4c022622a88fa230a0d0446c3607921cd9 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 30 Apr 2025 15:32:05 -0700 Subject: [PATCH 79/91] Pass cost estimator test --- .../include/local-execution/task_argument_accessor.h | 8 +++++++- lib/local-execution/src/local_cost_estimator.cc | 12 ++++++------ lib/local-execution/src/local_training_backing.cc | 7 ++++--- .../test/src/test_local_cost_estimator.cc | 2 +- .../src/per_device_op_state.cc} | 0 5 files changed, 18 insertions(+), 11 deletions(-) rename lib/{local-execution/src/per_device_state.cc => task-spec/src/per_device_op_state.cc} (100%) diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 99c1c1296b..285b41991a 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -14,7 +14,13 @@ struct TaskArgumentAccessor { if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { PerDeviceOpState device_states = this->ptr->get_concrete_arg(slot).get(); - return device_states.get(); + if (device_states.has()) { + return device_states.get(); + } else { + throw mk_runtime_error( + fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", + device_states.index())); + } } else { return this->ptr->get_concrete_arg(slot).get(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 532fcc91c2..0ee6c9a987 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,12 +90,12 @@ CostDetails 
LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); + + float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed forward" << std::endl; + float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index b2e0a2fb7e..7d916715f5 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,8 +104,7 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; + auto fn = task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -116,13 +115,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index 0fa841be20..e493265f86 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*kdim=*/embed_dim, /*vdim=*/embed_dim, /*dropout=*/0.0, - /*bias=*/true, + /*bias=*/false, /*add_bias_kv=*/false, /*add_zero_attn=*/false, }; diff --git a/lib/local-execution/src/per_device_state.cc b/lib/task-spec/src/per_device_op_state.cc similarity index 100% rename from lib/local-execution/src/per_device_state.cc rename to lib/task-spec/src/per_device_op_state.cc From ed0a164042c86dd063d40316271b374ef422215b Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 28 May 2025 02:10:09 -0700 Subject: [PATCH 80/91] feat: fix e2e test --- lib/kernels/include/kernels/accessor.h | 1 + .../kernels/managed_per_device_ff_handle.h | 6 +- lib/kernels/src/accessor.cc | 10 + .../src/managed_per_device_ff_handle.cc | 22 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- 
lib/kernels/test/src/test_replicate_kernel.cc | 2 +- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- .../local-execution/model_training_instance.h | 2 +- .../local-execution/task_argument_accessor.h | 6 +- .../src/local-execution/ops/linear.cc | 6 +- .../src/local_cost_estimator.cc | 11 +- .../src/local_training_backing.cc | 9 +- .../src/model_training_instance.cc | 4 +- lib/local-execution/src/optimizer.cc | 2 +- lib/local-execution/test/src/test_e2e.cc | 122 +++++---- .../test/src/test_local_cost_estimator.cc | 2 +- .../test/src/test_loss_functions.cc | 2 +- lib/local-execution/test/src/test_update.cc | 2 +- lib/pcg/include/pcg/computation_graph.h | 2 + lib/pcg/src/pcg/computation_graph.cc | 14 + .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/task_wrapper.h | 6 +- .../src/model_training_instance.cc | 4 +- .../src/realm_training_backing.cc | 4 +- lib/realm-backend/src/task_wrapper.cc | 32 ++- lib/realm-backend/test/src/test_e2e.cc | 255 ++++++++++-------- lib/realm-backend/test/src/test_update.cc | 2 +- 42 files changed, 341 insertions(+), 223 deletions(-) diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index 39da65c3be..55b120b090 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -75,6 +75,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &); int32_t *get_int32_ptr(GenericTensorAccessorW const &); int64_t *get_int64_ptr(GenericTensorAccessorW const &); float *get_float_ptr(GenericTensorAccessorW const &); +void write_to_host_float_ptr(GenericTensorAccessorW const &, float *); double *get_double_ptr(GenericTensorAccessorW const &); half *get_half_ptr(GenericTensorAccessorW const &); std::vector diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h index 0a83a5eecb..05e8406de8 100644 --- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h +++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h @@ -7,7 +7,7 @@ namespace FlexFlow { struct ManagedPerDeviceFFHandle { public: - ManagedPerDeviceFFHandle(); + ManagedPerDeviceFFHandle(int num_ranks, int my_rank); ManagedPerDeviceFFHandle(ManagedPerDeviceFFHandle const &) = delete; ManagedPerDeviceFFHandle & @@ -25,6 +25,10 @@ struct ManagedPerDeviceFFHandle { PerDeviceFFHandle *handle; }; +ManagedPerDeviceFFHandle initialize_single_gpu_handle(); +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank); + } // namespace FlexFlow #endif diff --git a/lib/kernels/src/accessor.cc b/lib/kernels/src/accessor.cc index 27b7eb390d..7f4f61c271 100644 --- a/lib/kernels/src/accessor.cc +++ b/lib/kernels/src/accessor.cc @@ -1,4 +1,5 @@ #include "kernels/accessor.h" +#include "device.h" namespace FlexFlow { @@ -76,6 +77,15 @@ float *get_float_ptr(GenericTensorAccessorW const &a) { return get(a); } +void write_to_host_float_ptr(GenericTensorAccessorW const &a, float *host_ptr) { + float *device_ptr = get(a); + int total_elements = get_volume(a.shape).unwrap_nonnegative(); + checkCUDA(cudaMemcpy(host_ptr, + device_ptr, + total_elements * sizeof(float), + cudaMemcpyDeviceToHost)); +} + double *get_double_ptr(GenericTensorAccessorW const &a) { return get(a); } diff --git 
a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index c050e887b6..e327a7b1e1 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -1,9 +1,10 @@ #include "kernels/managed_per_device_ff_handle.h" #include "device.h" +#include "kernels/nccl.h" namespace FlexFlow { -ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { +ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle(int num_ranks, int my_rank) { handle = new PerDeviceFFHandle; handle->workSpaceSize = 1024 * 1024; handle->allowTensorOpMathConversion = true; @@ -11,6 +12,13 @@ ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle() { checkCUDNN(cudnnCreate(&handle->dnn)); checkCUBLAS(cublasCreate(&handle->blas)); checkCUDA(cudaMalloc(&handle->workSpace, handle->workSpaceSize)); + +#ifdef FF_USE_NCCL + ncclUniqueId ncclId; + checkNCCL(ncclGetUniqueId(&ncclId)); + checkNCCL(ncclCommInitRank( + &handle->ncclComm, num_ranks, ncclId, my_rank)); // todo generalize +#endif } ManagedPerDeviceFFHandle::ManagedPerDeviceFFHandle( @@ -28,6 +36,9 @@ ManagedPerDeviceFFHandle::~ManagedPerDeviceFFHandle() { checkCUDNN(cudnnDestroy(handle->dnn)); checkCUBLAS(cublasDestroy(handle->blas)); checkCUDA(cudaFree(handle->workSpace)); +#ifdef FF_USE_NCCL + checkNCCL(ncclCommDestroy(handle->ncclComm)); +#endif delete handle; } } @@ -36,4 +47,13 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +ManagedPerDeviceFFHandle initialize_single_gpu_handle() { + return ManagedPerDeviceFFHandle(1, 0); +} + +ManagedPerDeviceFFHandle initialize_multi_gpu_handle(int num_ranks, + int my_rank) { + return ManagedPerDeviceFFHandle(num_ranks, my_rank); +} + } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index 64264f6c39..a15497984c 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index cacd5b60fb..b9cfbf3ec5 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { int seq_length = -1; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index b4c43cf1d8..94ce268b93 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2b6b9bf589..68f35cb099 100644 --- 
a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test combine kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 215e599716..ca6b95dadc 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int size_per_input = 100_n; ff_dim_t concat_axis = ff_dim_t{0_n}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index 86f8f2102b..7e78544df8 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 83f7f0445e..c9e1778843 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -7,7 +7,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Flat Kernel") { Allocator allocator = create_local_cuda_memory_allocator(); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 1a8cf5f82a..ffe8e0dfd2 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Gather Forward and Backward Kernel") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5386c1d943..9e89c86433 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape feature_shape = make_float_tensor_shape_from_legion_dims({feature_size}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 4fd1b53210..281a146a30 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; 
TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 62b61707c6..874e2b8d98 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -22,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { PoolOp pool_type = PoolOp::MAX; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 04a3817b84..7f993c12d3 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {10_n, 10_n, 10_n, 10_n, 10_n}); - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index fa726898f2..8c47c2a49a 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index d329a347b3..1e969f6d82 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -5,7 +5,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reshape Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index 9c8475f6d6..ba808c491a 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index c9eaa76b86..cba293aed1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { 
nonnegative_int input_w = 100_n; nonnegative_int channels = 100_n; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index ea0d280f68..65d1ed7783 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { coord_t in_blk_size = 100; coord_t num_blks = 1; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 02d99c86a1..f7007d76e4 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -12,7 +12,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/include/local-execution/model_training_instance.h b/lib/local-execution/include/local-execution/model_training_instance.h index 54b76313ab..2deed6b0a2 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h index 285b41991a..499b5ff7d6 100644 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/task_argument_accessor.h @@ -17,9 +17,9 @@ struct TaskArgumentAccessor { if (device_states.has()) { return device_states.get(); } else { - throw mk_runtime_error( - fmt::format("Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); + throw mk_runtime_error(fmt::format( + "Invalid access to PerDeviceOpState attempted, instead it holds: {}", + device_states.index())); } } else { return this->ptr->get_concrete_arg(slot).get(); diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc index 94f92d37ee..768293b32f 100644 --- a/lib/local-execution/src/local-execution/ops/linear.cc +++ b/lib/local-execution/src/local-execution/ops/linear.cc @@ -89,7 +89,6 @@ static
std::optional forward_task_impl(TaskArgumentAccessor const &acc) { batch_size.unwrap_nonnegative()); } -; - static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - auto bias = acc.get_tensor(BIAS); auto input_grad = acc.get_tensor_grad(INPUT); auto weight_grad = acc.get_tensor_grad(WEIGHT); @@ -137,6 +134,7 @@ static std::optional float const *bias_ptr = NULL; if (attrs.use_bias) { + auto bias = acc.get_tensor(BIAS); bias_ptr = bias.get_float_ptr(); } diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0ee6c9a987..0a84c19066 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -90,11 +90,14 @@ CostDetails LocalCostEstimator::estimate_cost( computation_graph, this->runtime_arg_config); // execute layer - layer_guid_t operator_layer_guid = get_layer_by_name(computation_graph, "operator"); - - float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); + layer_guid_t operator_layer_guid = + get_layer_by_name(computation_graph, "operator"); + + float fwd = + execute_forward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed forward" << std::endl; - float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); + float bwd = + execute_backward(local_backing, operator_layer_guid, allocator).value(); std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index 7d916715f5..d508c34210 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -104,7 +104,8 @@ std::optional call_task_impl(TaskRegistry const &task_registry, task_id_t const &task_id, TaskArgumentAccessor const &acc) { TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = task_sig_impl.impl_function.get().function_ptr; + auto fn = + task_sig_impl.impl_function.get().function_ptr; return fn(acc); } @@ -115,15 +116,15 @@ std::optional if (registry_contains_task_for_layer(local_training_backing.task_registry, operator_node, OpTaskType::FWD)) { - + ComputationGraphOpAttrs attrs = get_layer_attrs(local_training_backing.computation_graph, operator_node) .op_attrs; - + std::optional device_state = get_per_device_op_state_if_exists( local_training_backing.local_args_backing, operator_node); - + TaskInvocation invocation = lower_to_task_invocation( forward(attrs), operator_node, diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index f232011230..d214d0d426 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -54,14 +54,14 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing .tensor_gradient_mapping.at(this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - 
return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc index 1b9ce83d14..1b8fc37b2d 100644 --- a/lib/local-execution/src/optimizer.cc +++ b/lib/local-execution/src/optimizer.cc @@ -70,7 +70,7 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) { int size = weight_grad.shape.get_volume().unwrap_nonnegative(); - assert(weight_grad.shape.get_volume().unwrap_nonnegative() & - weight.shape.get_volume().unwrap_nonnegative() == 0); + assert(weight_grad.shape.get_volume().unwrap_nonnegative() % + weight.shape.get_volume().unwrap_nonnegative() == 0); int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() / weight.shape.get_volume().unwrap_nonnegative(); diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 05aaab0c88..ccad60a900 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -3,48 +3,42 @@ #include "kernels/managed_per_device_ff_handle.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" +#include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" #include "test_utils.h" #include "utils/containers/get_only.h" -#include "local-execution/model_training_instance.h" #include using namespace ::FlexFlow; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("E2ETest") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); // allocate label tensors LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); nonnegative_int batch_size = 10_n; nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; @@ -53,11 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW label_tensor_backing = allocator.allocate_tensor(output_tensor_shape); AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -66,32 +56,55 @@ TEST_SUITE(FF_TEST_SUITE) { TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, + TensorShape weight_shape_1 = TensorShape{ +
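// first-layer weights: data_dim x hidden_dim (the second layer below is + // hidden_dim x output_dim) +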
TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); + add_input_layer_with_grad(computation_graph, input_tensor_shape); - LayerAddedResult weights_layer = add_layer( + LayerAddedResult weights_layer_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, std::nullopt}, {}, {}); - LayerAddedResult linear_operator = add_layer( + LayerAddedResult weights_layer_2 = add_layer( computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); + weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -99,42 +112,57 @@ TEST_SUITE(FF_TEST_SUITE) { ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + LossAttrs loss_attrs = LossAttrs{ + NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - allocator, local_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; - - int num_epochs = 10; - std::vector loss_values (num_epochs); + allocated_tensors, + gradient_tensor_source, + optimizer_tensor_source, + computation_graph, + runtime_arg_config, + optimizer_attrs}; + + // begin training loop + ModelTrainingInstance model_training_instance = + ModelTrainingInstance{allocator, + local_training_backing, + logit_tensor, + label_tensor, + loss_attrs, + optimizer_attrs}; + + int num_epochs = 5; + int num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); + float *host_loss_ptr = new 
float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; + } + + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + CHECK(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; } - - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); } } \ No newline at end of file diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index e493265f86..c9c5afe04e 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -12,7 +12,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { // local backing initialization - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ae76dcccf9..ca2482653b 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index dcd9c025b3..75ba517d1b 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); diff --git a/lib/pcg/include/pcg/computation_graph.h b/lib/pcg/include/pcg/computation_graph.h index efc955ec92..60e825c11a 100644 --- a/lib/pcg/include/pcg/computation_graph.h +++ b/lib/pcg/include/pcg/computation_graph.h @@ -24,6 +24,8 @@ LayerAddedResult add_layer( LayerAddedResult add_input_layer(ComputationGraph &computation_graph, TensorShape const &tensor_shape); +LayerAddedResult add_input_layer_with_grad(ComputationGraph &computation_graph, + TensorShape const &tensor_shape); TensorAttrs get_tensor_attrs(ComputationGraph const &, tensor_guid_t const &); bool are_tensor_guid_shapes_equivalent(ComputationGraph const &cg, diff --git a/lib/pcg/src/pcg/computation_graph.cc b/lib/pcg/src/pcg/computation_graph.cc index 200410dd7b..b8917eed35 100644 --- a/lib/pcg/src/pcg/computation_graph.cc +++ b/lib/pcg/src/pcg/computation_graph.cc @@ -100,6 +100,20 @@ LayerAddedResult add_input_layer(ComputationGraph &cg, /*outputs=*/std::vector{CreateGrad::NO}); } +LayerAddedResult 
add_input_layer_with_grad(ComputationGraph &cg, + TensorShape const &tensor_shape) { + LayerAttrs layer_attrs = LayerAttrs{ + /*op_attrs=*/ComputationGraphOpAttrs{InputAttrs{tensor_shape}}, + /*name=*/std::nullopt, + }; + + return add_layer(cg, + layer_attrs, + /*inputs=*/{}, + /*weights=*/{}, + /*outputs=*/std::vector{CreateGrad::YES}); +} + TensorAttrs get_tensor_attrs(ComputationGraph const &cg, tensor_guid_t const &t) { return cg.raw_graph.at(t.raw_graph_output); diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index bc9c79dccf..049836d042 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,7 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - GenericTensorAccessorW get_loss_tensor_backing(); + void write_loss_tensor_to_host(float *host_ptr); }; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h index 8265ca398b..64a360e549 100644 --- a/lib/realm-backend/include/realm-backend/task_wrapper.h +++ b/lib/realm-backend/include/realm-backend/task_wrapper.h @@ -25,11 +25,11 @@ void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p); -void register_wrapper_tasks_init(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_init(int p_id, Realm::Processor p, task_id_t task_id); -void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_fwdbwd(int p_id, Realm::Processor p, task_id_t task_id); -void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id); +void register_wrapper_tasks_generic(int p_id, Realm::Processor p, task_id_t task_id); void register_wrapper_tasks(int pid, Realm::Processor p, task_id_t task_id, TaskSignatureAndImpl task_sig_impl); diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index d420776e42..0c318f8942 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -73,14 +73,14 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } -GenericTensorAccessorW ModelTrainingInstance::get_loss_tensor_backing() { +void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { gradient_tensor_t loss_tensor = this->training_backing.realm_tensor_backing .tensor_gradient_mapping.at(this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.realm_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - return loss_tensor_backing; + write_to_host_float_ptr(loss_tensor_backing, host_ptr); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 3b7eb48823..e2e28e9929 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -276,7 +276,7 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, realm_training_backing.realm_args_backing, invocation, realm_training_backing.allocators[0]); task_id_t task_id = invocation.task_id; - 
register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0], task_id); TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); // TODO: multi gpu launching @@ -311,7 +311,7 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, realm_training_backing.realm_args_backing, loss_invocation, realm_training_backing.allocators[0]); task_id_t task_id = loss_invocation.task_id; - register_wrapper_tasks_generic(realm_training_backing.worker_procs[0], + register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0], task_id); TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); // TODO: multi gpu launching diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index f07f11b60d..1a01fb0a58 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -38,21 +38,36 @@ void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, fn(task_args.accessor); } -void register_wrapper_tasks_init(Processor p, task_id_t task_id) { +void register_wrapper_tasks_init(int p_id, Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(init_wrapper_task), ProfilingRequestSet()) .external_wait(); } -void register_wrapper_tasks_fwdbwd(Realm::Processor p, task_id_t task_id) { +void register_wrapper_tasks_fwdbwd(int p_id, Realm::Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(fwdbwd_wrapper_task), ProfilingRequestSet()) .external_wait(); } -void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { +void register_wrapper_tasks_generic(int p_id, Realm::Processor p, task_id_t task_id) { + std::pair key = {p_id, task_id}; + if (registered_tasks.find(key) != registered_tasks.end()) { + return; + } + registered_tasks.insert(key); Processor::register_task_by_kind( p.kind(), false /*!global*/, get_realm_task_id(task_id), CodeDescriptor(generic_wrapper_task), ProfilingRequestSet()) @@ -61,21 +76,16 @@ void register_wrapper_tasks_generic(Realm::Processor p, task_id_t task_id) { void register_wrapper_tasks(int p_id, Processor p, task_id_t task_id, TaskSignatureAndImpl task_sig_impl) { - std::pair key = {p_id, task_id}; - if (registered_tasks.find(key) != registered_tasks.end()) { - return; - } - registered_tasks.insert(key); switch (task_sig_impl.task_signature.type) { case OpTaskType::INIT: - register_wrapper_tasks_init(p, task_id); + register_wrapper_tasks_init(p_id, p, task_id); break; case OpTaskType::FWD: case OpTaskType::BWD: - register_wrapper_tasks_fwdbwd(p, task_id); + register_wrapper_tasks_fwdbwd(p_id, p, task_id); break; default: - register_wrapper_tasks_generic(p, task_id); + register_wrapper_tasks_generic(p_id, p, task_id); break; } } diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index 659b0e5977..ba180494c3 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -10,136 +10,163 @@ #include "test_utils.h" #include 
"utils/containers/get_only.h" #include "realm-backend/model_training_instance.h" -#include using namespace ::FlexFlow; using namespace Realm; -bool did_loss_decrease(GenericTensorAccessorW const &first_epoch, GenericTensorAccessorW const & last_epoch) { - float* first_epoch_ptr = first_epoch.get_float_ptr(); - float* last_epoch_ptr = last_epoch.get_float_ptr(); - - int batch_size = first_epoch.shape.at(ff_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); +bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { for (int i = 0; i < batch_size; i++) { - if (first_epoch_ptr[i] < last_epoch_ptr[i]) { + if (first_epoch[i] < last_epoch[i]) { return false; } } - return true; } void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { - // initialize runtime - ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; - std::vector worker_procs; - std::vector allocators; - Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) - .only_kind(Processor::TOC_PROC); - assert(pq.count() > 0); - for (Processor p : pq) { - worker_procs.push_back(p); - allocators.push_back(create_realm_memory_allocator(p)); + // initialize runtime + ManagedFFStream managed_stream{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + std::vector worker_procs; + std::vector allocators; + Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) + .only_kind(Processor::TOC_PROC); + assert(pq.count() > 0); + for (Processor p : pq) { + worker_procs.push_back(p); + allocators.push_back(create_realm_memory_allocator(p)); + } + + // allocate label tensors + LossTensorSource loss_tensor_source; + loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); + + nonnegative_int batch_size = 10_n; + nonnegative_int data_dim = 16_n; + nonnegative_int hidden_dim = 32_n; + nonnegative_int output_dim = 1_n; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, + DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocators[0].allocate_tensor(output_tensor_shape); + AllocatedTensors allocated_tensors = AllocatedTensors{ + {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, + DataType::FLOAT}; + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{data_dim, hidden_dim}}, + DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, output_dim}}, + DataType::FLOAT}; + + LayerAddedResult inputs_layer = + add_input_layer_with_grad(computation_graph, input_tensor_shape); + + LayerAddedResult weights_layer_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult weights_layer_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, + std::nullopt}, + {}, + {}); + + LayerAddedResult linear_operator_1 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + inputs_layer.outputs, + 
weights_layer_1.outputs); + + LayerAddedResult linear_operator_2 = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, + /*use_bias=*/false, + DataType::FLOAT, + Activation::RELU, + std::nullopt}}, + std::nullopt}, + linear_operator_1.outputs, + weights_layer_2.outputs); + + tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); + + RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ + DeviceSpecific::create(managed_handle.raw_handle()), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + + // initialize training backing + LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; + OptimizerAttrs optimizer_attrs = + OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001}}; + + + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + + { + printf("\nRunning test %d: E2ETest...\n", 1); + RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, + optimizer_tensor_source, computation_graph, runtime_arg_config, + optimizer_attrs); + // begin training loop + ModelTrainingInstance model_training_instance = ModelTrainingInstance{ + realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs + }; + + int num_epochs = 5; + int num_samples = batch_size.unwrap_nonnegative(); + std::vector loss_values(num_epochs); + + for (int i = 0; i < num_epochs; i++) { + model_training_instance.forward(); + model_training_instance.backward(); + model_training_instance.update(); + float *host_loss_ptr = new float[num_samples]; + model_training_instance.write_loss_tensor_to_host(host_loss_ptr); + loss_values[i] = host_loss_ptr; } - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = - loss_tensor_source.new_loss_tensor(); - - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; - - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; - - GenericTensorAccessorW label_tensor_backing = - allocators[0].allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{label_tensor}, - label_tensor_backing}}, - {}, - {}}; - - // construct computation graph - ComputationGraph computation_graph = make_empty_computation_graph(); - - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; - - TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; - - LayerAddedResult inputs_layer = - add_input_layer(computation_graph, input_tensor_shape); - - LayerAddedResult weights_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - - LayerAddedResult linear_operator = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - inputs_layer.outputs, - weights_layer.outputs); - tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - 
DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - { - printf("\nRunning test %d: E2ETest...\n", 1); - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( - p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, - optimizer_tensor_source, computation_graph, runtime_arg_config, - optimizer_attrs); - // begin training loop - ModelTrainingInstance model_training_instance = ModelTrainingInstance{ - realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs - }; - - int num_epochs = 10; - std::vector loss_values (num_epochs); - - for (int i = 0; i < num_epochs; i++) { - model_training_instance.forward(); - model_training_instance.backward(); - model_training_instance.update(); - loss_values[i] = model_training_instance.get_loss_tensor_backing(); - } - // Assert that each sample in the batch has a lower loss in last epoch than the first epoch - CHECK(did_loss_decrease(loss_values[0], loss_values[num_epochs - 1])); + // Assert that each sample in the batch has a lower loss in last epoch than + // the first epoch + float *first_epoch = loss_values[0]; + float *last_epoch = loss_values[num_epochs - 1]; + if(did_loss_decrease( + first_epoch, last_epoch, batch_size.unwrap_nonnegative())) { printf("passed\n"); + } else { + printf("failed\n"); + } + + for (int i = 0; i < num_epochs; i++) { + delete[] loss_values[i]; } } } \ No newline at end of file diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc index 0b332d1ccc..b1f6bebe74 100644 --- a/lib/realm-backend/test/src/test_update.cc +++ b/lib/realm-backend/test/src/test_update.cc @@ -16,7 +16,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle{}; + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); std::vector worker_procs; std::vector allocators; Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) From 7755b9488a0858fa802bcc5b72d5588a0168400b Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 28 May 2025 02:45:44 -0700 Subject: [PATCH 81/91] fix: TaskArgumentAccessor holds a shared_ptr, which needs to be handled carefully across threads.
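Processor::spawn copies the argument buffer by value as raw bytes, so spawning with a stack-allocated RealmTaskArgs bit-copies the TaskArgumentAccessor inside it. The accessor owns a shared_ptr, and a byte copy neither bumps the reference count nor keeps the launcher's stack frame alive, so the asynchronous task can run against a dead object. The changes below instead heap-allocate the RealmTaskArgs at every launch site, marshal only the pointer value (a single uintptr_t) through Realm, and have each wrapper task recover the pointer, invoke the impl function, and delete the args exactly once.

A minimal sketch of the ownership pattern, with stand-in types in place of the real Realm/FlexFlow ones so it is self-contained (marshal/task_body are illustrative names, not APIs from this repo):

    #include <cassert>
    #include <cstdint>
    #include <memory>

    struct Accessor { std::shared_ptr<int> state; };  // stands in for TaskArgumentAccessor
    struct TaskArgs { Accessor accessor; };           // stands in for RealmTaskArgs<T>

    // Launch side: heap-allocate, then pass only the pointer value through
    // the byte-copied argument buffer.
    uintptr_t marshal(Accessor const &acc) {
      TaskArgs *heap_args = new TaskArgs{acc};        // shared_ptr refcount bumped here
      return reinterpret_cast<uintptr_t>(heap_args);
    }

    // Task side: recover the pointer, use the accessor, free exactly once.
    void task_body(void const *args, size_t arglen) {
      assert(arglen == sizeof(uintptr_t));
      uintptr_t raw = *reinterpret_cast<uintptr_t const *>(args);
      TaskArgs *task_args = reinterpret_cast<TaskArgs *>(raw);
      // ... invoke the impl function with task_args->accessor ...
      delete task_args;                               // drops the shared_ptr on the worker
    }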
--- .../src/realm_training_backing.cc | 33 ++++++++++------- lib/realm-backend/src/task_wrapper.cc | 37 ++++++++++++------- 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index e2e28e9929..4e36bf8d5c 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -137,11 +137,12 @@ initialize_args_backing(RealmTrainingBacking *backing, // TODO: multi gpu launching Promise promise = Promise(); Future future = promise.get_future(); - RealmTaskArgs args{ + RealmTaskArgs* task_arg = new RealmTaskArgs{ task_id, impl_function, accessor, std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = worker_procs[0].spawn(get_realm_task_id(task_id), - &args, sizeof(args), worker_events[0]); + args, sizeof(uintptr_t), worker_events[0]); worker_events[0] = e; future.set_event(e); per_device_op_states.insert({node, future.get().value()}); @@ -185,10 +186,11 @@ execute_forward(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise(realm_training_backing.master_mem); Future future = promise.get_future(); - RealmTaskArgs args{task_id, impl_function, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, impl_function, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -232,10 +234,11 @@ execute_backward(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise(realm_training_backing.master_mem); Future future = promise.get_future(); - RealmTaskArgs args{task_id, impl_function, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, impl_function, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -282,10 +285,11 @@ Future execute_update(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); - RealmTaskArgs args{task_id, update_impl_fn, accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, update_impl_fn, accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); @@ -317,10 +321,11 @@ Future compute_loss(RealmTrainingBacking &realm_training_backing, // TODO: multi gpu launching Promise promise; Future future = promise.get_future(); - RealmTaskArgs args{task_id, loss_impl_fn, loss_accessor, - std::move(promise)}; + RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, loss_impl_fn, loss_accessor, + std::move(promise)}; + uintptr_t args[1] = {reinterpret_cast(task_arg)}; Event e = 
realm_training_backing.worker_procs[0].spawn( - get_realm_task_id(task_id), &args, sizeof(args), + get_realm_task_id(task_id), args, sizeof(uintptr_t), realm_training_backing.worker_events[0]); realm_training_backing.worker_events[0] = e; future.set_event(e); diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc index 1a01fb0a58..cb220f44dc 100644 --- a/lib/realm-backend/src/task_wrapper.cc +++ b/lib/realm-backend/src/task_wrapper.cc @@ -11,31 +11,40 @@ std::unordered_set> registered_tasks; void init_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - DeviceSpecificDeviceStates result = fn(task_args.accessor); - task_args.promise.set_value(result); + task_args->impl_function.get().function_ptr; + DeviceSpecificDeviceStates result = fn(task_args->accessor); + task_args->promise.set_value(result); + delete task_args; } void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - std::optional result = fn(task_args.accessor); - task_args.promise.set_value(result.has_value() ? result.value() : 0.0f); + task_args->impl_function.get().function_ptr; + std::optional result = fn(task_args->accessor); + task_args->promise.set_value(result.has_value() ? 
result.value() : 0.0f); + delete task_args; } void generic_wrapper_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Processor p) { - RealmTaskArgs const &task_args = - *reinterpret_cast *>(args); + assert(arglen == sizeof(uintptr_t)); + uintptr_t task_arg_ptr = *reinterpret_cast(args); + RealmTaskArgs *task_args = + reinterpret_cast *>(task_arg_ptr); auto fn = - task_args.impl_function.get().function_ptr; - fn(task_args.accessor); + task_args->impl_function.get().function_ptr; + fn(task_args->accessor); + delete task_args; } void register_wrapper_tasks_init(int p_id, Processor p, task_id_t task_id) { From 335ac6d5abb2a67efb86363aefb38986ce181b1a Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Jun 2025 08:47:18 -0700 Subject: [PATCH 82/91] Expose test kernels, fill weights --- .../src/kernels/format_accessor_contents.cc | 8 +-- .../test/src/cpu/ops/replicate_kernels.cc | 2 +- .../test/src/cpu/ops/reverse_kernels.cc | 2 +- lib/kernels/test/src/kernels/accessor.cc | 2 +- .../src/kernels/compare_tensor_accessors.cc | 2 +- .../src/kernels/format_accessor_contents.cc | 2 +- .../src/kernels/reduce_tensor_accessor.cc | 2 +- lib/kernels/test/src/test_attention_kernel.cc | 2 +- .../test/src/test_batch_matmul_kernel.cc | 2 +- .../test/src/test_batch_norm_kernel.cc | 2 +- lib/kernels/test/src/test_cast_kernel.cc | 2 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 2 +- lib/kernels/test/src/test_cuda.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 2 +- lib/kernels/test/src/test_flat_kernel.cc | 2 +- lib/kernels/test/src/test_gather_kernels.cc | 2 +- .../test/src/test_layer_norm_kernels.cc | 2 +- .../test/src/test_managed_ff_stream.cc | 2 +- lib/kernels/test/src/test_partition_kernel.cc | 2 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 2 +- lib/kernels/test/src/test_reduction_kernel.cc | 2 +- lib/kernels/test/src/test_replicate_kernel.cc | 8 +-- lib/kernels/test/src/test_reshape_kernel.cc | 2 +- lib/kernels/test/src/test_reverse_kernels.cc | 2 +- lib/kernels/test/src/test_softmax_kernel.cc | 2 +- lib/kernels/test/src/test_split_kernel.cc | 2 +- lib/kernels/test/src/test_transpose_kernel.cc | 2 +- lib/local-execution/src/loss_functions.cc | 14 ++++- .../src/model_training_instance.cc | 25 +++++++++ .../src/unallocated_tensors.cc | 1 - lib/local-execution/test/src/test_e2e.cc | 52 ++++++++++++------- lib/task-spec/src/task-spec/ops/linear.cc | 29 ++++++++++- 33 files changed, 132 insertions(+), 57 deletions(-) diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc index ed54b21cfd..d40e5c4268 100644 --- a/lib/kernels/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/src/kernels/format_accessor_contents.cc @@ -161,14 +161,14 @@ std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) { GenericTensorAccessorR cpu_accessor = copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); - int num_dims = accessor.shape.num_dims().unwrap_nonnegative(); + int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative(); switch (num_dims) { case 1: - return format_1d_accessor_r_contents(accessor); + return format_1d_accessor_r_contents(cpu_accessor); case 2: - return format_2d_accessor_r_contents(accessor); + return format_2d_accessor_r_contents(cpu_accessor); case 3: - return format_3d_accessor_r_contents(accessor); + return format_3d_accessor_r_contents(cpu_accessor); default: PANIC("Unhandled accessor 
dimensionality", num_dims); } diff --git a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc index b98b1745d5..1984fd5f83 100644 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels_cpu.h" diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc index 51025cd17b..5e27b9d350 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/cpu/ops/reverse_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/reverse_kernels_cpu.h" diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 45e83cc0c6..31a6cba205 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,5 +1,5 @@ #include "kernels/accessor.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 85ffa91315..4e85dfdaa0 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,5 +1,5 @@ #include "kernels/compare_tensor_accessors.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index f515f2495b..a2b61b8dff 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,5 +1,5 @@ #include "kernels/format_accessor_contents.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index a269cf4777..dd5f8e06f6 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,5 +1,5 @@ #include "kernels/reduce_tensor_accessor.h" -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index f80c080f11..a086974a74 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/attention_kernels.h" #include diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc 
index dd98a36094..b0fe356c95 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/batch_matmul_kernels.h" #include diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 534901daf2..c173fd6d24 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/batch_norm_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 7539b2457c..9472e44a15 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/cast_kernels.h" #include "kernels/cast_kernels_cpu.h" #include diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index f3a2a8153d..7ac4d0f881 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/combine_kernels.h" #include "kernels/combine_kernels_cpu.h" #include diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 397b5cdf90..5dc8e441bd 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/concat_kernels.h" #include "utils/containers/repeat.h" #include diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index de3215cf2d..60bc6251b2 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include #include diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index c4518293dd..fb8b8dc87c 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/dropout_kernels.h" #include "utils/containers/count.h" #include diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 14930e280b..cea07ce781 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/flat_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 365fd3fb81..6a553bd107 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/gather_kernels.h" #include diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 3e63294e78..5382bb3a84 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ 
b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/layer_norm_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc index ed2d8dc2b6..25a346446b 100644 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ b/lib/kernels/test/src/test_managed_ff_stream.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/gather_kernels.h" #include diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 40a9eead53..c042ae3175 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/partition_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index a999311b81..58fff5c884 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/pool_2d_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index e2c4c36a71..4d030c4d93 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reduction_kernels.h" #include "op-attrs/datatype_value.h" #include diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 5f58239a31..9806cefe8d 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "kernels/replicate_kernels.h" @@ -31,7 +31,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("forward_kernel") { GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); + create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); GenericTensorAccessorW output = gpu_allocator.allocate_tensor(output_shape); @@ -47,7 +47,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { SUBCASE("backward_kernel") { GenericTensorAccessorR output_grad = - create_2d_accessor_r_with_contents( + create_2d_accessor_r_with_contents( { {1, 2, 3}, {4, 3, 3}, @@ -56,7 +56,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { gpu_allocator); GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( + create_1d_accessor_r_with_contents( {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); GenericTensorAccessorW input_grad = diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 066db28a17..011f35e567 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reshape_kernels.h" #include diff --git a/lib/kernels/test/src/test_reverse_kernels.cc 
b/lib/kernels/test/src/test_reverse_kernels.cc index 6a0ad84a92..fc5c8deaad 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/reverse_kernels.h" #include "kernels/reverse_kernels_cpu.h" #include "op-attrs/datatype_value.h" diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index bf10b5c633..bb449f6755 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/softmax_kernels.h" #include diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 1c1c4d4d51..2597db95e0 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/split_kernels.h" #include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 8560d33e5b..c0b2d4db5e 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "kernels/transpose_kernels.h" #include diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index c23159a85d..99225b1895 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,6 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" +#include "kernels/format_accessor_contents.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -55,6 +56,7 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] @@ -109,15 +111,23 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { + size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int(); + size_t logit_grad_volume = + get_num_elements(logit_grad.shape).int_from_positive_int(); + profile(categorical_crossentropy_loss_backward_kernel, profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_num_elements(logit.shape).int_from_positive_int(), - get_num_elements(logit_grad.shape).int_from_positive_int(), + logit_volume, + logit_grad_volume, scale_factor); + + + std::cout << "Logit grad (loss) tensor after computation" << std::endl; + std::cout << format_accessor_w_contents(logit_grad) << std::endl; break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 790c5e8e18..847b1679ac 100644 --- 
a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,4 +1,5 @@ #include "local-execution/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/reversed.h" @@ -34,6 +35,17 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); + std::cout << "Done computing loss" << std::endl; + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + + std::cout << "Loss (logit grad) tensor" << std::endl; + std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; + PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( topological_ordering(this->training_backing.computation_graph))) { @@ -55,12 +67,25 @@ void ModelTrainingInstance::update() { } GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + // for (auto const &pair : + // this->training_backing.local_tensor_backing.tensor_backings) { + // std::cout << "Tensor type: " << pair.first << std::endl; + // std::cout << "Tensor " << std::endl; + // std::cout << format_accessor_w_contents(pair.second) << std::endl; + // } + gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); + + std::cout << "Loss (logit grad) tensor" << std::endl; + std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index 363d1eedef..b8daa90e3b 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -70,7 +70,6 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } - std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 8827e0269d..80b2e6a398 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,10 +1,7 @@ #include "kernels/compare_tensor_accessors.h" -#include "kernels/copy_tensor_accessor.h" -#include "kernels/local_cpu_allocator.h" -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/format_accessor_contents.h" #include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -45,32 +42,33 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { positive_int 
hidden_dim = 32_p; positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - }, - /*gradient_mapping=*/{}, - /*optimizer_mapping*/ {}, - }; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocator); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocator); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocator); + LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -79,6 +77,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -87,13 +86,14 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -104,7 +104,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -129,6 +129,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, @@ -162,8 +173,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch + std::cout << "Final loss values" << std::endl; GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + std::cout << format_accessor_r_contents(first_epoch_loss) << std::endl; + GenericTensorAccessorR 
last_epoch = loss_values.back(); + std::cout << format_accessor_r_contents(last_epoch) << std::endl; + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } } diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index 5e56ccdc1b..b56931b3f3 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/linear.h" #include "kernels/linear_kernels.h" +#include "kernels/format_accessor_contents.h" #include "op-attrs/ff_dim_t.h" #include "task-spec/task_argument_accessor.h" #include "utils/exception.h" @@ -90,6 +91,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); + std::cout << "Input tensor" << std::endl; + std::cout << format_accessor_r_contents(input) << std::endl; + + std::cout << "Weight tensor" << std::endl; + std::cout << format_accessor_r_contents(weight) << std::endl; + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -105,7 +112,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { bias_ptr = bias.get_float_ptr(); } - return profile(forward_kernel, + auto result = profile(forward_kernel, profiling, "[Linear] forward_time = {:.2lf}ms\n", per_device_state, @@ -116,6 +123,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); + + std::cout << "Output tensor" << std::endl; + std::cout << format_accessor_w_contents(output) << std::endl; + + return result; } static std::optional @@ -128,6 +140,12 @@ static std::optional auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); + std::cout << "output grad tensor" << std::endl; + std::cout << format_accessor_w_contents(output_grad) << std::endl; + + std::cout << "weight grad tensor" << std::endl; + std::cout << format_accessor_w_contents(weight_grad) << std::endl; + auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -143,7 +161,7 @@ static std::optional positive_int out_dim = output.shape.at(ff_dim_t{0_n}); positive_int batch_size = positive_int{output.shape.num_elements() / out_dim}; - return profile(backward_kernel, + auto result = profile(backward_kernel, profiling, "[Linear] backward_time = {:.2lf}ms\n", per_device_state, @@ -157,6 +175,13 @@ static std::optional in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); + std::cout << "output grad tensor after backward kernel" << std::endl; + std::cout << format_accessor_w_contents(output_grad) << std::endl; + + std::cout << "weight grad tensor after backward kernel" << std::endl; + std::cout << format_accessor_w_contents(weight_grad) << std::endl; + + return result; } TaskImplFunction get_linear_init_task_impl() { From dbbb57434600d14d1b74da7513a8eeeca98df594 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Wed, 11 Jun 2025 08:47:32 -0700 Subject: [PATCH 83/91] Expose test utils --- lib/kernels/{test/src/internal => include/kernels}/test_utils.h | 0 lib/kernels/{test/src/internal => src}/test_utils.cc | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename lib/kernels/{test/src/internal => include/kernels}/test_utils.h (100%) rename lib/kernels/{test/src/internal => 
src}/test_utils.cc (99%) diff --git a/lib/kernels/test/src/internal/test_utils.h b/lib/kernels/include/kernels/test_utils.h similarity index 100% rename from lib/kernels/test/src/internal/test_utils.h rename to lib/kernels/include/kernels/test_utils.h diff --git a/lib/kernels/test/src/internal/test_utils.cc b/lib/kernels/src/test_utils.cc similarity index 99% rename from lib/kernels/test/src/internal/test_utils.cc rename to lib/kernels/src/test_utils.cc index a9ba8dea13..67f2fb624a 100644 --- a/lib/kernels/test/src/internal/test_utils.cc +++ b/lib/kernels/src/test_utils.cc @@ -1,4 +1,4 @@ -#include "internal/test_utils.h" +#include "kernels/test_utils.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/require_all_same1.h" #include "utils/join_strings.h" From a4c1ea4e1eddabec041c12b31092e8757c026be7 Mon Sep 17 00:00:00 2001 From: Reyna Abhyankar Date: Mon, 16 Jun 2025 22:25:37 -0700 Subject: [PATCH 84/91] Remove prints --- .../src/local_cost_estimator.cc | 2 -- lib/local-execution/src/loss_functions.cc | 2 -- .../src/model_training_instance.cc | 12 ----------- lib/local-execution/test/src/test_e2e.cc | 3 --- lib/task-spec/src/task-spec/ops/linear.cc | 20 ------------------- 5 files changed, 39 deletions(-) diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0a84c19066..85f315c7d1 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,10 +95,8 @@ CostDetails LocalCostEstimator::estimate_cost( float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed forward" << std::endl; float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 99225b1895..4d0b32fd48 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -126,8 +126,6 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { scale_factor); - std::cout << "Logit grad (loss) tensor after computation" << std::endl; - std::cout << format_accessor_w_contents(logit_grad) << std::endl; break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index 847b1679ac..d3c1c65a68 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -35,16 +35,12 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); - std::cout << "Done computing loss" << std::endl; gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - - std::cout << "Loss (logit grad) tensor" << std::endl; - std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( @@ -70,12 +66,6 @@ GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { GenericTensorAccessorW logit_tensor_backing = this->training_backing 
.local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - // for (auto const &pair : - // this->training_backing.local_tensor_backing.tensor_backings) { - // std::cout << "Tensor type: " << pair.first << std::endl; - // std::cout << "Tensor " << std::endl; - // std::cout << format_accessor_w_contents(pair.second) << std::endl; - // } gradient_tensor_t loss_tensor = this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( @@ -84,8 +74,6 @@ GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - std::cout << "Loss (logit grad) tensor" << std::endl; - std::cout << format_accessor_w_contents(loss_tensor_backing) << std::endl; return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index 80b2e6a398..de759e2e01 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -173,12 +173,9 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - std::cout << "Final loss values" << std::endl; GenericTensorAccessorR first_epoch_loss = loss_values.at(0); - std::cout << format_accessor_r_contents(first_epoch_loss) << std::endl; GenericTensorAccessorR last_epoch = loss_values.back(); - std::cout << format_accessor_r_contents(last_epoch) << std::endl; CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc index b56931b3f3..e8be7781f5 100644 --- a/lib/task-spec/src/task-spec/ops/linear.cc +++ b/lib/task-spec/src/task-spec/ops/linear.cc @@ -91,12 +91,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - std::cout << "Input tensor" << std::endl; - std::cout << format_accessor_r_contents(input) << std::endl; - - std::cout << "Weight tensor" << std::endl; - std::cout << format_accessor_r_contents(weight) << std::endl; - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -123,9 +117,6 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); - - std::cout << "Output tensor" << std::endl; - std::cout << format_accessor_w_contents(output) << std::endl; return result; } @@ -140,12 +131,6 @@ static std::optional auto weight_grad = acc.get_tensor_grad(WEIGHT); auto output_grad = acc.get_tensor_grad(OUTPUT); - std::cout << "output grad tensor" << std::endl; - std::cout << format_accessor_w_contents(output_grad) << std::endl; - - std::cout << "weight grad tensor" << std::endl; - std::cout << format_accessor_w_contents(weight_grad) << std::endl; - auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); @@ -175,11 +160,6 @@ static std::optional in_dim.int_from_positive_int(), out_dim.int_from_positive_int(), batch_size.int_from_positive_int()); - std::cout << "output grad tensor after backward kernel" << std::endl; - std::cout << format_accessor_w_contents(output_grad) << std::endl; - - std::cout << "weight grad tensor after backward kernel" << std::endl; - std::cout << 
format_accessor_w_contents(weight_grad) << std::endl; return result; } From 346f986e0db5f9f582311e6b06f8dccaa86fc943 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 11:37:24 -0700 Subject: [PATCH 85/91] tweak: minor --- .../allocated_tensors.struct.toml | 1 - .../fwd_bwd_op_task_impl_function.h | 32 -- .../generic_task_impl_function.h | 33 -- .../init_op_task_impl_function.h | 33 -- .../local-execution/itask_argument_accessor.h | 32 -- .../local-execution/local_cpu_allocator.h | 22 -- .../local_task_argument_accessor.h | 2 +- .../local_tensor_backing.struct.toml | 3 - .../include/local-execution/loss_functions.h | 2 +- .../local-execution/loss_tensor_source.h | 3 +- .../local-execution/model_training_instance.h | 2 +- .../include/local-execution/ops/attention.h | 26 -- .../local-execution/ops/batch_matmul.h | 24 -- .../include/local-execution/ops/batch_norm.h | 26 -- .../include/local-execution/ops/cast.h | 37 -- .../include/local-execution/ops/combine.h | 23 -- .../include/local-execution/ops/concat.h | 23 -- .../include/local-execution/ops/conv_2d.h | 26 -- .../include/local-execution/ops/dropout.h | 27 -- .../local-execution/ops/element_binary.h | 26 -- .../local-execution/ops/element_unary.h | 26 -- .../include/local-execution/ops/embedding.h | 23 -- .../include/local-execution/ops/flat.h | 23 -- .../include/local-execution/ops/gather.h | 26 -- .../include/local-execution/ops/input.h | 13 - .../include/local-execution/ops/layer_norm.h | 26 -- .../include/local-execution/ops/linear.h | 26 -- .../include/local-execution/ops/noop.h | 13 - .../include/local-execution/ops/parallel_op.h | 40 -- .../include/local-execution/ops/pool_2d.h | 26 -- .../include/local-execution/ops/reduce.h | 26 -- .../include/local-execution/ops/reduction.h | 24 -- .../include/local-execution/ops/repartition.h | 26 -- .../include/local-execution/ops/replicate.h | 23 -- .../include/local-execution/ops/reshape.h | 26 -- .../include/local-execution/ops/reverse.h | 23 -- .../include/local-execution/ops/softmax.h | 26 -- .../include/local-execution/ops/split.h | 23 -- .../include/local-execution/ops/topk.h | 26 -- .../include/local-execution/ops/transpose.h | 23 -- .../include/local-execution/ops/weight.h | 13 - .../include/local-execution/optimizer.h | 2 +- .../include/local-execution/permissions.h | 54 --- .../privilege_tensor_accessor.h | 39 -- .../local-execution/task_argument_accessor.h | 153 -------- .../task_impl_function.variant.toml | 26 -- .../local-execution/task_registry.struct.toml | 2 +- .../local-execution/task_signature_impl.h | 20 - .../task_signature_impl.struct.toml | 20 - .../local-execution/tracked_allocator.h | 3 + lib/local-execution/src/allocated_tensors.cc | 11 +- .../src/fwd_bwd_op_task_impl_function.cc | 54 --- .../src/generic_task_impl_function.cc | 53 --- .../src/init_op_task_impl_function.cc | 53 --- .../src/local-execution/ops/attention.cc | 259 ------------- .../src/local-execution/ops/batch_matmul.cc | 194 ---------- .../src/local-execution/ops/batch_norm.cc | 196 ---------- .../src/local-execution/ops/cast.cc | 110 ------ .../src/local-execution/ops/combine.cc | 94 ----- .../src/local-execution/ops/concat.cc | 107 ----- .../src/local-execution/ops/conv_2d.cc | 184 --------- .../src/local-execution/ops/dropout.cc | 134 ------- .../src/local-execution/ops/element_binary.cc | 180 --------- .../src/local-execution/ops/element_unary.cc | 165 -------- .../src/local-execution/ops/flat.cc | 81 ---- .../src/local-execution/ops/gather.cc | 174 --------- 
.../src/local-execution/ops/input.cc | 9 - .../src/local-execution/ops/layer_norm.cc | 190 --------- .../src/local-execution/ops/linear.cc | 210 ---------- .../src/local-execution/ops/noop.cc | 24 -- .../src/local-execution/ops/pool_2d.cc | 176 --------- .../src/local-execution/ops/reduce.cc | 148 ------- .../src/local-execution/ops/reduction.cc | 101 ----- .../src/local-execution/ops/repartition.cc | 137 ------- .../src/local-execution/ops/replicate.cc | 99 ----- .../src/local-execution/ops/reshape.cc | 132 ------- .../src/local-execution/ops/reverse.cc | 135 ------- .../src/local-execution/ops/softmax.cc | 153 -------- .../src/local-execution/ops/split.cc | 140 ------- .../src/local-execution/ops/topk.cc | 162 -------- .../src/local-execution/ops/transpose.cc | 107 ----- .../src/local-execution/ops/weight.cc | 9 - .../src/local_cpu_allocator.cc | 24 -- .../src/local_task_argument_accessor.cc | 7 +- .../src/local_training_backing.cc | 2 +- lib/local-execution/src/loss_functions.cc | 70 ++-- lib/local-execution/src/loss_tensor_source.cc | 2 +- .../src/model_training_instance.cc | 21 +- lib/local-execution/src/optimizer.cc | 26 +- .../src/per_device_op_state.cc | 0 lib/local-execution/src/permissions.cc | 72 ---- lib/local-execution/src/task_registry.cc | 2 +- .../src/task_signature_impl.cc | 366 ------------------ lib/local-execution/src/tracked_allocator.cc | 7 +- .../src/unallocated_tensors.cc | 1 - lib/local-execution/test/CMakeLists.txt | 7 +- .../test/modify_test_commands.cmake | 21 - .../test/src/test_allocated_tensors.cc | 11 +- lib/local-execution/test/src/test_e2e.cc | 104 ++--- .../test/src/test_local_cost_estimator.cc | 21 +- .../test/src/test_local_task_arg_accessor.cc | 17 +- .../test/src/test_local_tensor_backing.cc | 8 +- .../test/src/test_loss_functions.cc | 24 +- .../test/src/test_task_registry.cc | 10 +- .../test/src/test_unallocated_tensors.cc | 11 +- lib/local-execution/test/src/test_update.cc | 18 +- 106 files changed, 209 insertions(+), 5847 deletions(-) delete mode 100644 lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/generic_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/init_op_task_impl_function.h delete mode 100644 lib/local-execution/include/local-execution/itask_argument_accessor.h delete mode 100644 lib/local-execution/include/local-execution/local_cpu_allocator.h delete mode 100644 lib/local-execution/include/local-execution/ops/attention.h delete mode 100644 lib/local-execution/include/local-execution/ops/batch_matmul.h delete mode 100644 lib/local-execution/include/local-execution/ops/batch_norm.h delete mode 100644 lib/local-execution/include/local-execution/ops/cast.h delete mode 100644 lib/local-execution/include/local-execution/ops/combine.h delete mode 100644 lib/local-execution/include/local-execution/ops/concat.h delete mode 100644 lib/local-execution/include/local-execution/ops/conv_2d.h delete mode 100644 lib/local-execution/include/local-execution/ops/dropout.h delete mode 100644 lib/local-execution/include/local-execution/ops/element_binary.h delete mode 100644 lib/local-execution/include/local-execution/ops/element_unary.h delete mode 100644 lib/local-execution/include/local-execution/ops/embedding.h delete mode 100644 lib/local-execution/include/local-execution/ops/flat.h delete mode 100644 lib/local-execution/include/local-execution/ops/gather.h delete mode 100644 
lib/local-execution/include/local-execution/ops/input.h delete mode 100644 lib/local-execution/include/local-execution/ops/layer_norm.h delete mode 100644 lib/local-execution/include/local-execution/ops/linear.h delete mode 100644 lib/local-execution/include/local-execution/ops/noop.h delete mode 100644 lib/local-execution/include/local-execution/ops/parallel_op.h delete mode 100644 lib/local-execution/include/local-execution/ops/pool_2d.h delete mode 100644 lib/local-execution/include/local-execution/ops/reduce.h delete mode 100644 lib/local-execution/include/local-execution/ops/reduction.h delete mode 100644 lib/local-execution/include/local-execution/ops/repartition.h delete mode 100644 lib/local-execution/include/local-execution/ops/replicate.h delete mode 100644 lib/local-execution/include/local-execution/ops/reshape.h delete mode 100644 lib/local-execution/include/local-execution/ops/reverse.h delete mode 100644 lib/local-execution/include/local-execution/ops/softmax.h delete mode 100644 lib/local-execution/include/local-execution/ops/split.h delete mode 100644 lib/local-execution/include/local-execution/ops/topk.h delete mode 100644 lib/local-execution/include/local-execution/ops/transpose.h delete mode 100644 lib/local-execution/include/local-execution/ops/weight.h delete mode 100644 lib/local-execution/include/local-execution/permissions.h delete mode 100644 lib/local-execution/include/local-execution/privilege_tensor_accessor.h delete mode 100644 lib/local-execution/include/local-execution/task_argument_accessor.h delete mode 100644 lib/local-execution/include/local-execution/task_impl_function.variant.toml delete mode 100644 lib/local-execution/include/local-execution/task_signature_impl.h delete mode 100644 lib/local-execution/include/local-execution/task_signature_impl.struct.toml delete mode 100644 lib/local-execution/src/fwd_bwd_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/generic_task_impl_function.cc delete mode 100644 lib/local-execution/src/init_op_task_impl_function.cc delete mode 100644 lib/local-execution/src/local-execution/ops/attention.cc delete mode 100644 lib/local-execution/src/local-execution/ops/batch_matmul.cc delete mode 100644 lib/local-execution/src/local-execution/ops/batch_norm.cc delete mode 100644 lib/local-execution/src/local-execution/ops/cast.cc delete mode 100644 lib/local-execution/src/local-execution/ops/combine.cc delete mode 100644 lib/local-execution/src/local-execution/ops/concat.cc delete mode 100644 lib/local-execution/src/local-execution/ops/conv_2d.cc delete mode 100644 lib/local-execution/src/local-execution/ops/dropout.cc delete mode 100644 lib/local-execution/src/local-execution/ops/element_binary.cc delete mode 100644 lib/local-execution/src/local-execution/ops/element_unary.cc delete mode 100644 lib/local-execution/src/local-execution/ops/flat.cc delete mode 100644 lib/local-execution/src/local-execution/ops/gather.cc delete mode 100644 lib/local-execution/src/local-execution/ops/input.cc delete mode 100644 lib/local-execution/src/local-execution/ops/layer_norm.cc delete mode 100644 lib/local-execution/src/local-execution/ops/linear.cc delete mode 100644 lib/local-execution/src/local-execution/ops/noop.cc delete mode 100644 lib/local-execution/src/local-execution/ops/pool_2d.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reduce.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reduction.cc delete mode 100644 lib/local-execution/src/local-execution/ops/repartition.cc delete 
mode 100644 lib/local-execution/src/local-execution/ops/replicate.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reshape.cc delete mode 100644 lib/local-execution/src/local-execution/ops/reverse.cc delete mode 100644 lib/local-execution/src/local-execution/ops/softmax.cc delete mode 100644 lib/local-execution/src/local-execution/ops/split.cc delete mode 100644 lib/local-execution/src/local-execution/ops/topk.cc delete mode 100644 lib/local-execution/src/local-execution/ops/transpose.cc delete mode 100644 lib/local-execution/src/local-execution/ops/weight.cc delete mode 100644 lib/local-execution/src/local_cpu_allocator.cc rename lib/{task-spec => local-execution}/src/per_device_op_state.cc (100%) delete mode 100644 lib/local-execution/src/permissions.cc delete mode 100644 lib/local-execution/src/task_signature_impl.cc delete mode 100644 lib/local-execution/test/modify_test_commands.cmake diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml index 09245097b4..33985b0d74 100644 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml @@ -3,7 +3,6 @@ name = "AllocatedTensors" features = [ "eq", "fmt", - "hash", ] includes = [ diff --git a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h b/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h deleted file mode 100644 index cc82291f6a..0000000000 --- a/lib/local-execution/include/local-execution/fwd_bwd_op_task_impl_function.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_FWD_BWD_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" - -namespace FlexFlow { - -struct FwdBwdOpTaskImplFunction { - - std::optional (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(FwdBwdOpTaskImplFunction const &) const; - bool operator!=(FwdBwdOpTaskImplFunction const &) const; - bool operator<(FwdBwdOpTaskImplFunction const &) const; - bool operator>(FwdBwdOpTaskImplFunction const &) const; - bool operator<=(FwdBwdOpTaskImplFunction const &) const; - bool operator>=(FwdBwdOpTaskImplFunction const &) const; -}; - -std::string format_as(FwdBwdOpTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::FwdBwdOpTaskImplFunction> { - size_t operator()(::FlexFlow::FwdBwdOpTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/generic_task_impl_function.h b/lib/local-execution/include/local-execution/generic_task_impl_function.h deleted file mode 100644 index 9ce22ecf54..0000000000 --- a/lib/local-execution/include/local-execution/generic_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_GENERIC_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" -#include "task-spec/device_specific_device_states.dtg.h" - -namespace FlexFlow { - -struct GenericTaskImplFunction { - - void (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(GenericTaskImplFunction const &) const; - bool operator!=(GenericTaskImplFunction const &) const; - bool 
operator<(GenericTaskImplFunction const &) const; - bool operator>(GenericTaskImplFunction const &) const; - bool operator<=(GenericTaskImplFunction const &) const; - bool operator>=(GenericTaskImplFunction const &) const; -}; - -std::string format_as(GenericTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::GenericTaskImplFunction> { - size_t operator()(::FlexFlow::GenericTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/init_op_task_impl_function.h b/lib/local-execution/include/local-execution/init_op_task_impl_function.h deleted file mode 100644 index 0481e31a5f..0000000000 --- a/lib/local-execution/include/local-execution/init_op_task_impl_function.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H -#define _FLEXFLOW_LOCAL_EXECUTION_INIT_TASK_IMPL_FUNCTION_H - -#include "local-execution/task_argument_accessor.h" -#include "task-spec/device_specific_device_states.dtg.h" - -namespace FlexFlow { - -struct InitOpTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - - bool operator==(InitOpTaskImplFunction const &) const; - bool operator!=(InitOpTaskImplFunction const &) const; - bool operator<(InitOpTaskImplFunction const &) const; - bool operator>(InitOpTaskImplFunction const &) const; - bool operator<=(InitOpTaskImplFunction const &) const; - bool operator>=(InitOpTaskImplFunction const &) const; -}; - -std::string format_as(InitOpTaskImplFunction const &x); -std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x); - -} // namespace FlexFlow - -namespace std { -template <> -struct hash<::FlexFlow::InitOpTaskImplFunction> { - size_t operator()(::FlexFlow::InitOpTaskImplFunction const &) const; -}; -} // namespace std - -#endif diff --git a/lib/local-execution/include/local-execution/itask_argument_accessor.h b/lib/local-execution/include/local-execution/itask_argument_accessor.h deleted file mode 100644 index 24b3b3a37f..0000000000 --- a/lib/local-execution/include/local-execution/itask_argument_accessor.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_ITASK_ARGUMENT_ACCESSOR_H - -#include "kernels/allocation.h" -#include "local-execution/privilege_tensor_accessor.h" -#include "task-spec/concrete_arg.h" -#include "task-spec/op_task_signature.h" -#include "task-spec/tensor_type.dtg.h" - -namespace FlexFlow { - -struct ITaskArgumentAccessor { - ITaskArgumentAccessor &operator=(ITaskArgumentAccessor const &) = delete; - - virtual ~ITaskArgumentAccessor() = default; - - virtual ConcreteArgSpec const &get_concrete_arg(slot_id_t) const = 0; - - virtual GenericTensorAccessor get_tensor(slot_id_t slot, - Permissions priv, - TensorType tensor_type) const = 0; - virtual VariadicGenericTensorAccessor get_variadic_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const = 0; - - virtual Allocator get_allocator() const = 0; - virtual size_t get_device_idx() const = 0; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(ITaskArgumentAccessor); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/local_cpu_allocator.h b/lib/local-execution/include/local-execution/local_cpu_allocator.h deleted file mode 100644 index d1e81facf2..0000000000 --- 
a/lib/local-execution/include/local-execution/local_cpu_allocator.h +++ /dev/null @@ -1,22 +0,0 @@ -#include "kernels/allocation.h" -#include - -namespace FlexFlow { - -struct LocalCPUAllocator : public IAllocator { - LocalCPUAllocator() = default; - LocalCPUAllocator(LocalCPUAllocator const &) = delete; - LocalCPUAllocator(LocalCPUAllocator &&) = delete; - ~LocalCPUAllocator() = default; - - void *allocate(size_t) override; - void deallocate(void *) override; - -private: - std::unordered_map> ptrs; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalCPUAllocator); - -Allocator create_local_cpu_memory_allocator(); - -} // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index c46534330b..184bf0b559 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,8 +1,8 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index c34063af5d..bd59ec325d 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "LocalTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/local-execution/include/local-execution/loss_functions.h index c06908503a..c75d4414de 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/local-execution/include/local-execution/loss_functions.h @@ -16,10 +16,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#include "local-execution/task_impl_function.dtg.h" #include "op-attrs/ops/loss_functions.h" #include "pcg/tensor_guid_t.dtg.h" #include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/local-execution/include/local-execution/loss_tensor_source.h index d9858cde40..b794207c7f 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/local-execution/include/local-execution/loss_tensor_source.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H #include "task-spec/loss_tensor_t.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -12,7 +13,7 @@ struct LossTensorSource { loss_tensor_t new_loss_tensor(); private: - static size_t next_available_loss_tensor_id; + static nonnegative_int next_available_loss_tensor_id; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/model_training_instance.h 
b/lib/local-execution/include/local-execution/model_training_instance.h index 2deed6b0a2..6f8f4b1543 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -30,7 +30,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; }; } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/ops/attention.h b/lib/local-execution/include/local-execution/ops/attention.h deleted file mode 100644 index bf5385f609..0000000000 --- a/lib/local-execution/include/local-execution/ops/attention.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_ATTENTION_H -#define _FLEXFLOW_ATTENTION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/attention.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(MultiHeadAttentionAttrs const &); - -TaskImplFunction get_attention_init_task_impl(); -TaskImplFunction get_attention_fwd_task_impl(); -TaskImplFunction get_attention_bwd_task_impl(); - -OpTaskSignature get_attention_init_signature(); -OpTaskSignature get_attention_fwd_signature(); -OpTaskSignature get_attention_bwd_signature(); - -OpTaskInvocation init(MultiHeadAttentionAttrs const &); -OpTaskInvocation forward(MultiHeadAttentionAttrs const &); -OpTaskInvocation backward(MultiHeadAttentionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/batch_matmul.h b/lib/local-execution/include/local-execution/ops/batch_matmul.h deleted file mode 100644 index 64d220ab66..0000000000 --- a/lib/local-execution/include/local-execution/ops/batch_matmul.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_BATCH_MATMUL_H -#define _FLEXFLOW_BATCH_MATMUL_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/batch_matmul_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/op_task_signature.h" - -namespace FlexFlow { - -std::vector get_task_ids(BatchMatmulAttrs const &); - -TaskImplFunction get_batch_matmul_fwd_task_impl(); -TaskImplFunction get_batch_matmul_bwd_task_impl(); - -OpTaskSignature get_batch_matmul_fwd_signature(); -OpTaskSignature get_batch_matmul_bwd_signature(); - -OpTaskInvocation forward(BatchMatmulAttrs const &); -OpTaskInvocation backward(BatchMatmulAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/batch_norm.h b/lib/local-execution/include/local-execution/ops/batch_norm.h deleted file mode 100644 index 85a7190ce1..0000000000 --- a/lib/local-execution/include/local-execution/ops/batch_norm.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_BATCH_NORM_H -#define _FLEXFLOW_BATCH_NORM_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/batch_norm_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(BatchNormAttrs const &); - -TaskImplFunction get_batch_norm_init_task_impl(); -TaskImplFunction get_batch_norm_fwd_task_impl(); -TaskImplFunction get_batch_norm_bwd_task_impl(); - -OpTaskSignature get_batch_norm_init_signature(); -OpTaskSignature get_batch_norm_fwd_signature(); -OpTaskSignature get_batch_norm_bwd_signature(); - -OpTaskInvocation init(BatchNormAttrs const &); -OpTaskInvocation 
forward(BatchNormAttrs const &); -OpTaskInvocation backward(BatchNormAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/cast.h b/lib/local-execution/include/local-execution/ops/cast.h deleted file mode 100644 index 6a27ad267a..0000000000 --- a/lib/local-execution/include/local-execution/ops/cast.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef _FLEXFLOW_CAST_H -#define _FLEXFLOW_CAST_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/cast_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(CastAttrs const &); - -TaskImplFunction get_cast_fwd_task_impl(); -TaskImplFunction get_cast_bwd_task_impl(); - -OpTaskSignature get_cast_fwd_signature(); -OpTaskSignature get_cast_bwd_signature(); - -OpTaskInvocation forward(CastAttrs const &); -OpTaskInvocation backward(CastAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/combine.h b/lib/local-execution/include/local-execution/ops/combine.h deleted file mode 100644 index 00e9cbed2c..0000000000 --- a/lib/local-execution/include/local-execution/ops/combine.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_COMBINE_H -#define _FLEXFLOW_COMBINE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/combine_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(CombineAttrs const &); - -TaskImplFunction get_combine_fwd_task_impl(); -TaskImplFunction get_combine_bwd_task_impl(); - -OpTaskSignature get_combine_fwd_signature(); -OpTaskSignature get_combine_bwd_signature(); - -OpTaskInvocation forward(CombineAttrs const &); -OpTaskInvocation backward(CombineAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/concat.h b/lib/local-execution/include/local-execution/ops/concat.h deleted file mode 100644 index c46164e417..0000000000 --- a/lib/local-execution/include/local-execution/ops/concat.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_CONCAT_H -#define _FLEXFLOW_CONCAT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/concat_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ConcatAttrs const &); - -TaskImplFunction get_concat_fwd_task_impl(); -TaskImplFunction get_concat_bwd_task_impl(); - -OpTaskSignature get_concat_fwd_signature(); -OpTaskSignature get_concat_bwd_signature(); - -OpTaskInvocation forward(ConcatAttrs const &); -OpTaskInvocation backward(ConcatAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/conv_2d.h b/lib/local-execution/include/local-execution/ops/conv_2d.h deleted file mode 100644 index 
f3bb34ffeb..0000000000 --- a/lib/local-execution/include/local-execution/ops/conv_2d.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_CONV_2D_H -#define _FLEXFLOW_CONV_2D_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/conv_2d_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(Conv2DAttrs const &); - -TaskImplFunction get_conv_2d_init_task_impl(); -TaskImplFunction get_conv_2d_fwd_task_impl(); -TaskImplFunction get_conv_2d_bwd_task_impl(); - -OpTaskSignature get_conv_2d_init_signature(); -OpTaskSignature get_conv_2d_fwd_signature(); -OpTaskSignature get_conv_2d_bwd_signature(); - -OpTaskInvocation init(Conv2DAttrs const &); -OpTaskInvocation forward(Conv2DAttrs const &); -OpTaskInvocation backward(Conv2DAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/dropout.h b/lib/local-execution/include/local-execution/ops/dropout.h deleted file mode 100644 index bd7b426c6b..0000000000 --- a/lib/local-execution/include/local-execution/ops/dropout.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FLEXFLOW_DROPOUT_H -#define _FLEXFLOW_DROPOUT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/dropout_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_id_t.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(DropoutAttrs const &); - -TaskImplFunction get_dropout_init_task_impl(); -TaskImplFunction get_dropout_fwd_task_impl(); -TaskImplFunction get_dropout_bwd_task_impl(); - -OpTaskSignature get_dropout_init_signature(); -OpTaskSignature get_dropout_fwd_signature(); -OpTaskSignature get_dropout_bwd_signature(); - -OpTaskInvocation init(DropoutAttrs const &); -OpTaskInvocation forward(DropoutAttrs const &); -OpTaskInvocation backward(DropoutAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/element_binary.h b/lib/local-execution/include/local-execution/ops/element_binary.h deleted file mode 100644 index 4e0bb46e47..0000000000 --- a/lib/local-execution/include/local-execution/ops/element_binary.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_ELEMENT_BINARY_H -#define _FLEXFLOW_ELEMENT_BINARY_H - -#include "local-execution/task_impl_function.dtg.h" -#include "local-execution/task_signature_impl.h" -#include "op-attrs/ops/element_binary_attrs.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ElementBinaryAttrs const &); - -OpTaskInvocation init(ElementBinaryAttrs const &); -OpTaskInvocation forward(ElementBinaryAttrs const &); -OpTaskInvocation backward(ElementBinaryAttrs const &); - -TaskImplFunction get_element_binary_init_task_impl(); -TaskImplFunction get_element_binary_fwd_task_impl(); -TaskImplFunction get_element_binary_bwd_task_impl(); - -OpTaskSignature get_element_binary_init_signature(); -OpTaskSignature get_element_binary_fwd_signature(); -OpTaskSignature get_element_binary_bwd_signature(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/element_unary.h b/lib/local-execution/include/local-execution/ops/element_unary.h deleted file mode 100644 index 9900668d6c..0000000000 --- a/lib/local-execution/include/local-execution/ops/element_unary.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _ELEMENT_UNARY_H -#define _ELEMENT_UNARY_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/element_unary_attrs.dtg.h" -#include 
"task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ElementUnaryAttrs const &); - -TaskImplFunction get_element_unary_init_task_impl(); -TaskImplFunction get_element_unary_fwd_task_impl(); -TaskImplFunction get_element_unary_bwd_task_impl(); - -OpTaskSignature get_element_unary_init_signature(); -OpTaskSignature get_element_unary_fwd_signature(); -OpTaskSignature get_element_unary_bwd_signature(); - -OpTaskInvocation init(ElementUnaryAttrs const &); -OpTaskInvocation forward(ElementUnaryAttrs const &); -OpTaskInvocation backward(ElementUnaryAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/embedding.h b/lib/local-execution/include/local-execution/ops/embedding.h deleted file mode 100644 index b998aef53e..0000000000 --- a/lib/local-execution/include/local-execution/ops/embedding.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_EMBEDDING_H -#define _FLEXFLOW_EMBEDDING_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/embedding_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(EmbeddingAttrs const &); - -TaskImplFunction get_embedding_fwd_task_impl(); -TaskImplFunction get_embedding_bwd_task_impl(); - -OpTaskSignature get_embedding_fwd_signature(); -OpTaskSignature get_embedding_bwd_signature(); - -OpTaskInvocation forward(EmbeddingAttrs const &); -OpTaskInvocation backward(EmbeddingAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/flat.h b/lib/local-execution/include/local-execution/ops/flat.h deleted file mode 100644 index 95afb98340..0000000000 --- a/lib/local-execution/include/local-execution/ops/flat.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_FLAT_H -#define _FLEXFLOW_FLAT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/flat_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(FlatAttrs const &); - -TaskImplFunction get_flat_fwd_task_impl(); -TaskImplFunction get_flat_bwd_task_impl(); - -OpTaskSignature get_flat_fwd_signature(); -OpTaskSignature get_flat_bwd_signature(); - -OpTaskInvocation forward(FlatAttrs const &); -OpTaskInvocation backward(FlatAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/gather.h b/lib/local-execution/include/local-execution/ops/gather.h deleted file mode 100644 index 5569a94728..0000000000 --- a/lib/local-execution/include/local-execution/ops/gather.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_GATHER_H -#define _FLEXFLOW_GATHER_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/gather_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(GatherAttrs const &); - -TaskImplFunction get_gather_init_task_impl(); -TaskImplFunction get_gather_fwd_task_impl(); -TaskImplFunction get_gather_bwd_task_impl(); - -OpTaskSignature get_gather_init_signature(); -OpTaskSignature get_gather_fwd_signature(); -OpTaskSignature get_gather_bwd_signature(); - -OpTaskInvocation init(GatherAttrs const &); -OpTaskInvocation forward(GatherAttrs const &); -OpTaskInvocation backward(GatherAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/input.h b/lib/local-execution/include/local-execution/ops/input.h deleted file mode 100644 index 
9181478363..0000000000 --- a/lib/local-execution/include/local-execution/ops/input.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_INPUT_H -#define _FLEXFLOW_INPUT_H - -#include "op-attrs/ops/input_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(InputAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/layer_norm.h b/lib/local-execution/include/local-execution/ops/layer_norm.h deleted file mode 100644 index e4a15caac2..0000000000 --- a/lib/local-execution/include/local-execution/ops/layer_norm.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_LAYER_NORM_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/layer_norm_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(LayerNormAttrs const &); - -TaskImplFunction get_layer_norm_init_task_impl(); -TaskImplFunction get_layer_norm_fwd_task_impl(); -TaskImplFunction get_layer_norm_bwd_task_impl(); - -OpTaskSignature get_layer_norm_init_signature(); -OpTaskSignature get_layer_norm_fwd_signature(); -OpTaskSignature get_layer_norm_bwd_signature(); - -OpTaskInvocation init(LayerNormAttrs const &); -OpTaskInvocation forward(LayerNormAttrs const &); -OpTaskInvocation backward(LayerNormAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/linear.h b/lib/local-execution/include/local-execution/ops/linear.h deleted file mode 100644 index d58d876865..0000000000 --- a/lib/local-execution/include/local-execution/ops/linear.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_LINEAR_H -#define _FLEXFLOW_LINEAR_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/linear_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(LinearAttrs const &); - -OpTaskInvocation init(LinearAttrs const &); -OpTaskInvocation forward(LinearAttrs const &); -OpTaskInvocation backward(LinearAttrs const &); - -TaskImplFunction get_linear_init_task_impl(); -TaskImplFunction get_linear_fwd_task_impl(); -TaskImplFunction get_linear_bwd_task_impl(); - -OpTaskSignature get_linear_init_signature(); -OpTaskSignature get_linear_fwd_signature(); -OpTaskSignature get_linear_bwd_signature(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/noop.h b/lib/local-execution/include/local-execution/ops/noop.h deleted file mode 100644 index adbc15cd3b..0000000000 --- a/lib/local-execution/include/local-execution/ops/noop.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_NOOP_H -#define _FLEXFLOW_NOOP_H - -#include "op-attrs/ops/noop_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(NoopAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/parallel_op.h b/lib/local-execution/include/local-execution/ops/parallel_op.h deleted file mode 100644 index e7bd98b8a8..0000000000 --- a/lib/local-execution/include/local-execution/ops/parallel_op.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _FLEXFLOW_PARALLEL_OP_H -#define _FLEXFLOW_PARALLEL_OP_H - -#include "parallel_op_info.h" -#include "utils/optional.h" - -namespace FlexFlow { - -struct ParallelOpJoinResult { - std::optional op = std::nullopt; - bool join_did_succeed = false; -}; - 
-ParallelOpJoinResult try_join_parallel_ops(ParallelOpInfo const &, - ParallelOpInfo const &); - -/* class ParallelOp : public Op { */ -/* public: */ -/* ParallelOp(FFModel &model, */ -/* OperatorType type, */ -/* char const *_name, */ -/* const ParallelTensor input); */ -/* virtual void init(FFModel const &) = 0; */ -/* virtual void forward(FFModel const &) = 0; */ -/* virtual void backward(FFModel const &) = 0; */ -/* virtual void create_input_partition(FFModel &model) = 0; */ -/* virtual bool measure_operator_cost(Simulator *sim, */ -/* MachineView const &pc, */ -/* CostMetrics &cost_metrics) const = 0; */ -/* virtual bool append_parallel_op_info( */ -/* std::vector ¶llel_ops) const = 0; */ -/* virtual bool is_parallel_op() const; */ - -/* public: */ -/* Legion::LogicalPartition input_lp, output_grad_lp; */ -/* }; */ - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/pool_2d.h b/lib/local-execution/include/local-execution/ops/pool_2d.h deleted file mode 100644 index 7d0ec44bd7..0000000000 --- a/lib/local-execution/include/local-execution/ops/pool_2d.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_POOL_2D_H -#define _FLEXFLOW_POOL_2D_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/pool_2d_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(Pool2DAttrs const &); - -TaskImplFunction get_pool_2d_init_task_impl(); -TaskImplFunction get_pool_2d_fwd_task_impl(); -TaskImplFunction get_pool_2d_bwd_task_impl(); - -OpTaskSignature get_pool_2d_init_signature(); -OpTaskSignature get_pool_2d_fwd_signature(); -OpTaskSignature get_pool_2d_bwd_signature(); - -OpTaskInvocation init(Pool2DAttrs const &); -OpTaskInvocation forward(Pool2DAttrs const &); -OpTaskInvocation backward(Pool2DAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reduce.h b/lib/local-execution/include/local-execution/ops/reduce.h deleted file mode 100644 index 5c6d4be338..0000000000 --- a/lib/local-execution/include/local-execution/ops/reduce.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H -#define _FLEXFLOW_RUNTIME_SRC_OPS_REDUCE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reduce_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReduceAttrs const &); - -TaskImplFunction get_reduce_init_task_impl(); -TaskImplFunction get_reduce_fwd_task_impl(); -TaskImplFunction get_reduce_bwd_task_impl(); - -OpTaskSignature get_reduce_init_signature(); -OpTaskSignature get_reduce_fwd_signature(); -OpTaskSignature get_reduce_bwd_signature(); - -OpTaskInvocation init(ReduceAttrs const &); -OpTaskInvocation forward(ReduceAttrs const &); -OpTaskInvocation backward(ReduceAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reduction.h b/lib/local-execution/include/local-execution/ops/reduction.h deleted file mode 100644 index 7475d3aeb4..0000000000 --- a/lib/local-execution/include/local-execution/ops/reduction.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_REDUCTION_H -#define _FLEXFLOW_REDUCTION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reduction_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReductionAttrs const &); - -TaskImplFunction get_reduction_fwd_task_impl(); 
-TaskImplFunction get_reduction_bwd_task_impl(); - -OpTaskSignature get_reduction_fwd_signature(); -OpTaskSignature get_reduction_bwd_signature(); - -OpTaskInvocation init(ReductionAttrs const &); -OpTaskInvocation forward(ReductionAttrs const &); -OpTaskInvocation backward(ReductionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/repartition.h b/lib/local-execution/include/local-execution/ops/repartition.h deleted file mode 100644 index 08ecdafcf2..0000000000 --- a/lib/local-execution/include/local-execution/ops/repartition.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_PARTITION_H -#define _FLEXFLOW_PARTITION_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/repartition_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(RepartitionAttrs const &); - -TaskImplFunction get_repartition_init_task_impl(); -TaskImplFunction get_repartition_fwd_task_impl(); -TaskImplFunction get_repartition_bwd_task_impl(); - -OpTaskSignature get_repartition_init_signature(); -OpTaskSignature get_repartition_fwd_signature(); -OpTaskSignature get_repartition_bwd_signature(); - -OpTaskInvocation init(RepartitionAttrs const &); -OpTaskInvocation forward(RepartitionAttrs const &); -OpTaskInvocation backward(RepartitionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/replicate.h b/lib/local-execution/include/local-execution/ops/replicate.h deleted file mode 100644 index b827b9c272..0000000000 --- a/lib/local-execution/include/local-execution/ops/replicate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REPLICATE_H -#define _FLEXFLOW_REPLICATE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/replicate_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReplicateAttrs const &); - -TaskImplFunction get_replicate_fwd_task_impl(); -TaskImplFunction get_replicate_bwd_task_impl(); - -OpTaskSignature get_replicate_fwd_signature(); -OpTaskSignature get_replicate_bwd_signature(); - -OpTaskInvocation forward(ReplicateAttrs const &); -OpTaskInvocation backward(ReplicateAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reshape.h b/lib/local-execution/include/local-execution/ops/reshape.h deleted file mode 100644 index ed7e6e9e31..0000000000 --- a/lib/local-execution/include/local-execution/ops/reshape.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_RESHAPE_H -#define _FLEXFLOW_RESHAPE_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reshape_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReshapeAttrs const &); - -TaskImplFunction get_reshape_init_task_impl(); -TaskImplFunction get_reshape_fwd_task_impl(); -TaskImplFunction get_reshape_bwd_task_impl(); - -OpTaskSignature get_reshape_init_signature(); -OpTaskSignature get_reshape_fwd_signature(); -OpTaskSignature get_reshape_bwd_signature(); - -OpTaskInvocation init(ReshapeAttrs const &); -OpTaskInvocation forward(ReshapeAttrs const &); -OpTaskInvocation backward(ReshapeAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/reverse.h b/lib/local-execution/include/local-execution/ops/reverse.h deleted file mode 100644 index dd0e89ecad..0000000000 --- 
a/lib/local-execution/include/local-execution/ops/reverse.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REVERSE_H_ -#define _FLEXFLOW_REVERSE_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/reverse_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReverseAttrs const &); - -TaskImplFunction get_reverse_fwd_task_impl(); -TaskImplFunction get_reverse_bwd_task_impl(); - -OpTaskSignature get_reverse_fwd_signature(); -OpTaskSignature get_reverse_bwd_signature(); - -OpTaskInvocation forward(ReverseAttrs const &); -OpTaskInvocation backward(ReverseAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/softmax.h b/lib/local-execution/include/local-execution/ops/softmax.h deleted file mode 100644 index 294d948b42..0000000000 --- a/lib/local-execution/include/local-execution/ops/softmax.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_SOFTMAX_H -#define _FLEXFLOW_SOFTMAX_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/softmax_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(SoftmaxAttrs const &); - -TaskImplFunction get_softmax_init_task_impl(); -TaskImplFunction get_softmax_fwd_task_impl(); -TaskImplFunction get_softmax_bwd_task_impl(); - -OpTaskSignature get_softmax_init_signature(); -OpTaskSignature get_softmax_fwd_signature(); -OpTaskSignature get_softmax_bwd_signature(); - -OpTaskInvocation init(SoftmaxAttrs const &); -OpTaskInvocation forward(SoftmaxAttrs const &); -OpTaskInvocation backward(SoftmaxAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/split.h b/lib/local-execution/include/local-execution/ops/split.h deleted file mode 100644 index 49cd7cfc7b..0000000000 --- a/lib/local-execution/include/local-execution/ops/split.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_SPLIT_H -#define _FLEXFLOW_SPLIT_H - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/split_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(SplitAttrs const &); - -TaskImplFunction get_split_fwd_task_impl(); -TaskImplFunction get_split_bwd_task_impl(); - -OpTaskSignature get_split_fwd_signature(); -OpTaskSignature get_split_bwd_signature(); - -OpTaskInvocation forward(SplitAttrs const &); -OpTaskInvocation backward(SplitAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/topk.h b/lib/local-execution/include/local-execution/ops/topk.h deleted file mode 100644 index aeded512cd..0000000000 --- a/lib/local-execution/include/local-execution/ops/topk.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_TOPK_H_ -#define _FLEXFLOW_TOPK_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/topk_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(TopKAttrs const &); - -TaskImplFunction get_topk_init_task_impl(); -TaskImplFunction get_topk_fwd_task_impl(); -TaskImplFunction get_topk_bwd_task_impl(); - -OpTaskSignature get_topk_init_signature(); -OpTaskSignature get_topk_fwd_signature(); -OpTaskSignature get_topk_bwd_signature(); - -OpTaskInvocation init(TopKAttrs const &); -OpTaskInvocation forward(TopKAttrs const &); -OpTaskInvocation backward(TopKAttrs const &); - -} // namespace FlexFlow - 
-#endif diff --git a/lib/local-execution/include/local-execution/ops/transpose.h b/lib/local-execution/include/local-execution/ops/transpose.h deleted file mode 100644 index 2c7b5fb3bc..0000000000 --- a/lib/local-execution/include/local-execution/ops/transpose.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_TRANSPOSE_H_ -#define _FLEXFLOW_TRANSPOSE_H_ - -#include "local-execution/task_impl_function.dtg.h" -#include "op-attrs/ops/transpose_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(TransposeAttrs const &); - -TaskImplFunction get_transpose_fwd_task_impl(); -TaskImplFunction get_transpose_bwd_task_impl(); - -OpTaskSignature get_transpose_fwd_signature(); -OpTaskSignature get_transpose_bwd_signature(); - -OpTaskInvocation forward(TransposeAttrs const &); -OpTaskInvocation backward(TransposeAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/ops/weight.h b/lib/local-execution/include/local-execution/ops/weight.h deleted file mode 100644 index 162236e41e..0000000000 --- a/lib/local-execution/include/local-execution/ops/weight.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef _FLEXFLOW_WEIGHT_H -#define _FLEXFLOW_WEIGHT_H - -#include "op-attrs/ops/weight_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" - -namespace FlexFlow { - -std::vector get_task_ids(WeightAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/local-execution/include/local-execution/optimizer.h index f6bd5a3ee9..e4a9c78743 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/local-execution/include/local-execution/optimizer.h @@ -1,10 +1,10 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ #define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#include "local-execution/task_impl_function.dtg.h" #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" #include "pcg/optimizers/sgd_optimizer_attrs.dtg.h" +#include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" diff --git a/lib/local-execution/include/local-execution/permissions.h b/lib/local-execution/include/local-execution/permissions.h deleted file mode 100644 index f34969f233..0000000000 --- a/lib/local-execution/include/local-execution/permissions.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H -#define _FLEXFLOW_LOCAL_EXECUTION_PERMISSION_H - -#include "utils/exception.h" -#include "utils/fmt.h" - -namespace FlexFlow { - -enum class Permissions { NONE, RO, WO, RW }; - -Permissions join(Permissions lhs, Permissions rhs); -Permissions meet(Permissions lhs, Permissions rhs); - -bool operator<(Permissions lhs, Permissions rhs); -bool operator<=(Permissions lhs, Permissions rhs); -bool operator>(Permissions lhs, Permissions rhs); -bool operator>=(Permissions lhs, Permissions rhs); - -} // namespace FlexFlow - -namespace fmt { - -template <> -struct formatter<::FlexFlow::Permissions> : formatter { - template - auto format(::FlexFlow::Permissions p, FormatContext &ctx) const - -> decltype(ctx.out()) { - using ::FlexFlow::Permissions; - - string_view name = "unknown"; - switch (p) { - case Permissions::NONE: - name = "NO_PERMISSIONS"; - break; - case Permissions::RO: - name = "READ_ONLY"; - break; - case Permissions::WO: - name = "WRITE_ONLY"; - break; - case Permissions::RW: - name = 
"READ_WRITE"; - break; - default: - throw ::FlexFlow::mk_runtime_error( - fmt::format("Unknown permission {}", static_cast(p))); - } - return formatter::format(name, ctx); - } -}; - -} // namespace fmt - -#endif diff --git a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h b/lib/local-execution/include/local-execution/privilege_tensor_accessor.h deleted file mode 100644 index aeae3c2e41..0000000000 --- a/lib/local-execution/include/local-execution/privilege_tensor_accessor.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_PRIVILEGE_TENSOR_ACCESSOR_H - -#include "kernels/accessor.h" -#include "local-execution/permissions.h" - -namespace FlexFlow { - -template -struct privilege_mode_to_accessor_t {}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorW; -}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorR; -}; - -template <> -struct privilege_mode_to_accessor_t { - using type = GenericTensorAccessorW; -}; - -template -using privilege_mode_to_accessor = - typename privilege_mode_to_accessor_t::type; - -using GenericTensorAccessor = - std::variant; -using VariadicGenericTensorAccessor = - std::variant, - std::vector>; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_argument_accessor.h b/lib/local-execution/include/local-execution/task_argument_accessor.h deleted file mode 100644 index 499b5ff7d6..0000000000 --- a/lib/local-execution/include/local-execution/task_argument_accessor.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_ARGUMENT_ACCESSOR_H - -#include "local-execution/itask_argument_accessor.h" -#include "task-spec/device_specific.h" -#include "task-spec/per_device_op_state.dtg.h" - -namespace FlexFlow { - -struct TaskArgumentAccessor { - // arguments - template - T const &get_argument(slot_id_t slot) const { - if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { - PerDeviceOpState device_states = - this->ptr->get_concrete_arg(slot).get(); - if (device_states.has()) { - return device_states.get(); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); - } - } else { - return this->ptr->get_concrete_arg(slot).get(); - } - } - - template - T const &get_argument(int slot) const { - return this->get_argument(slot_id_t{slot}); - } - - // tensors - template - privilege_mode_to_accessor get_tensor(int slot) const { - return this->get_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::FORWARD)); - } - - template - privilege_mode_to_accessor get_tensor_grad(int slot) const { - return this->get_tensor_grad(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_tensor_grad(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::GRADIENT)); - } - - template - privilege_mode_to_accessor get_optimizer_tensor(int slot) const { - return this->get_optimizer_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_optimizer_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::OPTIMIZER)); - } - - template - privilege_mode_to_accessor get_loss_tensor(int slot) 
const { - return this->get_loss_tensor(slot_id_t{slot}); - } - - template - privilege_mode_to_accessor get_loss_tensor(slot_id_t slot) const { - return std::get>( - this->ptr->get_tensor(slot, PRIV, TensorType::LOSS)); - } - - // variadic tensors - template - std::vector> - get_variadic_tensor(int slot) const { - return this->get_variadic_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::FORWARD)); - } - - template - std::vector> - get_variadic_tensor_grad(int slot) const { - return this->get_variadic_tensor_grad(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_tensor_grad(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::GRADIENT)); - } - - template - std::vector> - get_variadic_optimizer_tensor(int slot) const { - return this->get_variadic_optimizer_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_optimizer_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::OPTIMIZER)); - } - - template - std::vector> - get_variadic_loss_tensor(int slot) const { - return this->get_variadic_loss_tensor(slot_id_t{slot}); - } - - template - std::vector> - get_variadic_loss_tensor(slot_id_t slot) const { - return std::get>>( - this->ptr->get_variadic_tensor(slot, PRIV, TensorType::LOSS)); - } - - Allocator get_allocator() const { - return this->ptr->get_allocator(); - } - - template - static - typename std::enable_if::value, - TaskArgumentAccessor>::type - create(Args &&...args) { - return TaskArgumentAccessor( - std::make_shared(std::forward(args)...)); - } - -private: - TaskArgumentAccessor(std::shared_ptr ptr) - : ptr(ptr) {} - std::shared_ptr ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_impl_function.variant.toml b/lib/local-execution/include/local-execution/task_impl_function.variant.toml deleted file mode 100644 index 48cab9eb01..0000000000 --- a/lib/local-execution/include/local-execution/task_impl_function.variant.toml +++ /dev/null @@ -1,26 +0,0 @@ -namespace = "FlexFlow" -name = "TaskImplFunction" -features = [ - "eq", - "fmt", - "hash", - "ord" -] - -includes = [ - "local-execution/init_op_task_impl_function.h", - "local-execution/fwd_bwd_op_task_impl_function.h", - "local-execution/generic_task_impl_function.h", -] - -[[values]] -type = "::FlexFlow::InitOpTaskImplFunction" -key = "init_op_task_impl_function" - -[[values]] -type = "::FlexFlow::FwdBwdOpTaskImplFunction" -key = "fwd_bwd_op_task_impl_function" - -[[values]] -type = "::FlexFlow::GenericTaskImplFunction" -key = "generic_task_impl_function" diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml index c3784b617f..f5daa62090 100644 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ b/lib/local-execution/include/local-execution/task_registry.struct.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "local-execution/task_signature_impl.dtg.h", + "task-spec/task_signature_impl.dtg.h", "task-spec/task_id_t.dtg.h", "pcg/layer_guid_t.dtg.h", ] diff --git a/lib/local-execution/include/local-execution/task_signature_impl.h b/lib/local-execution/include/local-execution/task_signature_impl.h deleted file mode 100644 index 613a173f25..0000000000 --- 
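// Illustrative sketch of how a task body consumes the deleted
// TaskArgumentAccessor API; the slot ids and the Permissions template
// arguments below are assumptions for illustration only.
static std::optional<float> example_task_impl(TaskArgumentAccessor const &acc) {
  // Read-only view of the forward tensor bound to slot INPUT.
  auto input = acc.get_tensor<Permissions::RO>(INPUT);
  // Write-only view of the forward tensor bound to slot OUTPUT.
  auto output = acc.get_tensor<Permissions::WO>(OUTPUT);
  // Read-write view of OUTPUT's gradient tensor.
  auto output_grad = acc.get_tensor_grad<Permissions::RW>(OUTPUT);
  // Non-tensor arguments are fetched by slot and static type.
  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
  return std::nullopt;
}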
a/lib/local-execution/include/local-execution/task_signature_impl.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_SIGNATURE_IMPL_H - -#include "local-execution/task_signature_impl.dtg.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_id_t.dtg.h" - -namespace FlexFlow { - -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &); -std::vector get_task_ids(ComputationGraphOpAttrs const &); - -OpTaskInvocation init(ComputationGraphOpAttrs const &); -OpTaskInvocation forward(ComputationGraphOpAttrs const &); -OpTaskInvocation backward(ComputationGraphOpAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml b/lib/local-execution/include/local-execution/task_signature_impl.struct.toml deleted file mode 100644 index 78064203ec..0000000000 --- a/lib/local-execution/include/local-execution/task_signature_impl.struct.toml +++ /dev/null @@ -1,20 +0,0 @@ -namespace = "FlexFlow" -name = "TaskSignatureAndImpl" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "local-execution/task_impl_function.dtg.h", - "task-spec/op_task_signature.h", -] - -[[fields]] -name = "impl_function" -type = "::FlexFlow::TaskImplFunction" - -[[fields]] -name = "task_signature" -type = "::FlexFlow::OpTaskSignature" diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index 731e04fdc8..f697337c52 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -13,6 +13,9 @@ struct TrackedAllocator : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + + DeviceType get_allocation_device_type() const override; + size_t get_current_mem_usage(); private: diff --git a/lib/local-execution/src/allocated_tensors.cc b/lib/local-execution/src/allocated_tensors.cc index 196da16ace..ffaeaf285f 100644 --- a/lib/local-execution/src/allocated_tensors.cc +++ b/lib/local-execution/src/allocated_tensors.cc @@ -35,7 +35,8 @@ bool are_allocated_forward_tensors_valid( if (!is_allocated_tensor_backing_valid( TensorTypeVariant{tensor_guid}, allocated_tensors.tensor_type_backings, - ArrayShape{tensor_attrs.at(tensor_guid).shape})) { + array_shape_from_tensor_shape( + tensor_attrs.at(tensor_guid).shape))) { return false; } } else { @@ -58,8 +59,8 @@ bool are_allocated_gradient_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_grad.first).shape}; + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_grad.first).shape); TensorTypeVariant gradient_tensor = TensorTypeVariant{tensor_to_grad.second}; if (is_allocated_tensor_backing_valid( @@ -100,8 +101,8 @@ bool are_allocated_optimizer_tensors_valid( return false; } - ArrayShape tensor_guid_array_shape = - ArrayShape{tensor_attrs.at(tensor_to_optimizers.first).shape}; + ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( + tensor_attrs.at(tensor_to_optimizers.first).shape); for (optimizer_tensor_t const &optimizer_tensor : tensor_to_optimizers.second) { if (is_allocated_tensor_backing_valid( diff --git a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc b/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc deleted file mode 
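// Sketch of the pairing the removed TaskSignatureAndImpl struct expresses:
// one implementation function plus the signature it satisfies, here using
// the attention getters that appear later in this patch (the variable
// name is illustrative).
TaskSignatureAndImpl attention_fwd_sig_impl{
    /*impl_function=*/get_attention_fwd_task_impl(),
    /*task_signature=*/get_attention_fwd_signature(),
};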
100644 index 308dbfd3ae..0000000000 --- a/lib/local-execution/src/fwd_bwd_op_task_impl_function.cc +++ /dev/null @@ -1,54 +0,0 @@ -#include "local-execution/fwd_bwd_op_task_impl_function.h" - -namespace FlexFlow { - -bool FwdBwdOpTaskImplFunction::operator==( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator!=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator<( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator>( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator<=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool FwdBwdOpTaskImplFunction::operator>=( - FwdBwdOpTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(FwdBwdOpTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, FwdBwdOpTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::FwdBwdOpTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/generic_task_impl_function.cc b/lib/local-execution/src/generic_task_impl_function.cc deleted file mode 100644 index 87d4db53e6..0000000000 --- a/lib/local-execution/src/generic_task_impl_function.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "local-execution/generic_task_impl_function.h" - -namespace FlexFlow { - -bool GenericTaskImplFunction::operator==( - GenericTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool GenericTaskImplFunction::operator!=( - GenericTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool GenericTaskImplFunction::operator<( - GenericTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool GenericTaskImplFunction::operator>( - GenericTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool GenericTaskImplFunction::operator<=( - GenericTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool GenericTaskImplFunction::operator>=( - GenericTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(GenericTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, GenericTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::GenericTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/init_op_task_impl_function.cc b/lib/local-execution/src/init_op_task_impl_function.cc deleted file mode 100644 index abe84b828e..0000000000 --- a/lib/local-execution/src/init_op_task_impl_function.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "local-execution/init_op_task_impl_function.h" - -namespace FlexFlow { - -bool 
InitOpTaskImplFunction::operator==( - InitOpTaskImplFunction const &other) const { - return this->function_ptr == other.function_ptr; -} - -bool InitOpTaskImplFunction::operator!=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr != other.function_ptr; -} - -bool InitOpTaskImplFunction::operator<( - InitOpTaskImplFunction const &other) const { - return this->function_ptr < other.function_ptr; -} - -bool InitOpTaskImplFunction::operator>( - InitOpTaskImplFunction const &other) const { - return this->function_ptr > other.function_ptr; -} - -bool InitOpTaskImplFunction::operator<=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr <= other.function_ptr; -} - -bool InitOpTaskImplFunction::operator>=( - InitOpTaskImplFunction const &other) const { - return this->function_ptr >= other.function_ptr; -} - -std::string format_as(InitOpTaskImplFunction const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} -std::ostream &operator<<(std::ostream &s, InitOpTaskImplFunction const &x) { - return s << fmt::to_string(x); -} - -} // namespace FlexFlow - -namespace std { -size_t hash::operator()( - ::FlexFlow::InitOpTaskImplFunction const &x) const { - return std::hash{}(x.function_ptr); -} -} // namespace std diff --git a/lib/local-execution/src/local-execution/ops/attention.cc b/lib/local-execution/src/local-execution/ops/attention.cc deleted file mode 100644 index a9e6a9fa30..0000000000 --- a/lib/local-execution/src/local-execution/ops/attention.cc +++ /dev/null @@ -1,259 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
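// The three wrapper types deleted here (FwdBwdOpTaskImplFunction,
// GenericTaskImplFunction, InitOpTaskImplFunction) share one pattern:
// equality, ordering, and hashing all delegate to the stored function
// pointer, so two wrappers compare equal only if they wrap the very same
// function. Reduced to a sketch (names illustrative):
struct ExampleTaskImplFunction {
  std::optional<float> (*function_ptr)(TaskArgumentAccessor const &);

  bool operator==(ExampleTaskImplFunction const &other) const {
    // Pointer identity, not behavioral equivalence.
    return this->function_ptr == other.function_ptr;
  }
};
// Hashing applies std::hash to the pointer value itself, so the hash
// relation agrees with operator==.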
- */ - -#include "local-execution/ops/attention.h" -#include "kernels/attention_kernels.h" -#include "op-attrs/ops/attention.h" -#include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" -#include "task-spec/op_task_signature.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::MultiHeadAttention; - -enum Slots { - QUERY_PARALLEL_TENSOR_SHAPE, - KEY_PARALLEL_TENSOR_SHAPE, - VALUE_PARALLEL_TENSOR_SHAPE, - QPROJSIZE, - KPROJSIZE, - VPROJSIZE, - OPROJSIZE, - ATTRS, - PROFILING, - QUERY, - KEY, - VALUE, - WEIGHTS, - OUTPUT, - HANDLE, - PER_DEVICE_STATE -}; - -OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b; - - b.bind_arg(HANDLE, ff_handle()); - b.bind_arg(ATTRS, attrs); - - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1)); - b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2)); - - b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); - b.bind_arg(KPROJSIZE, get_kProjSize(attrs)); - b.bind_arg(VPROJSIZE, get_vProjSize(attrs)); - b.bind_arg(OPROJSIZE, get_oProjSize(attrs)); - - return {task_id_t::ATTENTION_INIT_TASK_ID, b}; -} - -OpTaskInvocation forward(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b; - - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - - return {task_id_t::ATTENTION_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ATTENTION_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - Allocator allocator = acc.get_allocator(); - nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); - nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); - nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); - nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - ParallelTensorShape query_parallel_tensor_shape = - acc.get_argument(QUERY_PARALLEL_TENSOR_SHAPE); - ParallelTensorShape key_parallel_tensor_shape = - acc.get_argument(KEY_PARALLEL_TENSOR_SHAPE); - ParallelTensorShape value_parallel_tensor_shape = - acc.get_argument(VALUE_PARALLEL_TENSOR_SHAPE); - - MultiHeadAttentionParallelInputs parsed = throw_if_unexpected( - parse_attention_parallel_input_shape(query_parallel_tensor_shape, - key_parallel_tensor_shape, - value_parallel_tensor_shape)); - ParallelTensorShape weight_parallel_tensor_shape = - throw_if_unexpected(get_weights_shape(attrs, - query_parallel_tensor_shape, - key_parallel_tensor_shape, - value_parallel_tensor_shape)); - - nonnegative_int kvSeqLength = get_kvSeqLength(parsed); - nonnegative_int qSize = get_qSize(parsed); - nonnegative_int kSize = get_kSize(parsed); - nonnegative_int vSize = get_vSize(parsed); - - nonnegative_int qoSeqLength = get_qoSeqLength(parsed); - nonnegative_int num_samples = get_num_samples(parsed); - nonnegative_int num_heads = attrs.num_heads; - - MHAPerDeviceState per_device_state = - init_kernel(handle, - allocator, - num_samples.unwrap_nonnegative(), - num_heads.unwrap_nonnegative(), - qSize.unwrap_nonnegative(), - kSize.unwrap_nonnegative(), - 
vSize.unwrap_nonnegative(), - qProjSize.unwrap_nonnegative(), - kProjSize.unwrap_nonnegative(), - vProjSize.unwrap_nonnegative(), - oProjSize.unwrap_nonnegative(), - qoSeqLength.unwrap_nonnegative(), - kvSeqLength.unwrap_nonnegative(), - attrs.add_bias_kv); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto query = acc.get_tensor(QUERY); - auto key = acc.get_tensor(KEY); - auto value = acc.get_tensor(VALUE); - auto weight = acc.get_tensor(WEIGHTS); - auto output = acc.get_tensor(OUTPUT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[MultiHeadAttention] forward_time = {:.2lf}ms\n", - per_device_state, - query.get_float_ptr(), - key.get_float_ptr(), - value.get_float_ptr(), - weight.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto query = acc.get_tensor(QUERY); - auto key = acc.get_tensor(KEY); - auto value = acc.get_tensor(VALUE); - auto weight = acc.get_tensor(WEIGHTS); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHTS); - auto query_grad = acc.get_tensor_grad(QUERY); - auto key_grad = acc.get_tensor_grad(KEY); - auto value_grad = acc.get_tensor_grad(VALUE); - - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - float *key_grad_ptr = - (key_grad == query_grad) ? nullptr : key_grad.get_float_ptr(); - float *value_grad_ptr = (value_grad == query_grad || value_grad == key_grad) - ? 
nullptr - : value_grad.get_float_ptr(); - - assert(value_grad.shape == value.shape); - assert(key_grad.shape == key.shape); - - assert(query_grad.shape == query.shape); - assert(weight_grad.shape.get_volume() == weight.shape.get_volume()); - - return profile(backward_kernel, - profiling, - "[MultiHeadAttention] backward_time = {:.2lf}ms\n", - per_device_state, - query.get_float_ptr(), - query_grad.get_float_ptr(), - key.get_float_ptr(), - key_grad_ptr, - value.get_float_ptr(), - value_grad_ptr, - weight.get_float_ptr(), - weight_grad.get_float_ptr(), - output_grad.get_float_ptr()); -} - -TaskImplFunction get_attention_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_attention_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_attention_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_attention_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - init.add_arg_slot(QUERY_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(KEY_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(VALUE_PARALLEL_TENSOR_SHAPE); - init.add_arg_slot(QPROJSIZE); - init.add_arg_slot(KPROJSIZE); - init.add_arg_slot(VPROJSIZE); - init.add_arg_slot(OPROJSIZE); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_attention_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(QUERY); - fwd.add_input_slot(KEY); - fwd.add_input_slot(VALUE); - fwd.add_weight_slot(WEIGHTS); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - return fwd; -} - -OpTaskSignature get_attention_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_attention_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(MultiHeadAttentionAttrs const &) { - return {task_id_t::ATTENTION_INIT_TASK_ID, - task_id_t::ATTENTION_FWD_TASK_ID, - task_id_t::ATTENTION_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/batch_matmul.cc b/lib/local-execution/src/local-execution/ops/batch_matmul.cc deleted file mode 100644 index 2cbf1cf20f..0000000000 --- a/lib/local-execution/src/local-execution/ops/batch_matmul.cc +++ /dev/null @@ -1,194 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
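// Note the pattern attention establishes for every op below: backward()
// never re-describes the data flow; it derives its binding from forward()
// with the gradient slots inferred. Schematically (a sketch, assuming an
// attrs value in scope):
OpTaskInvocation fwd = forward(attrs); // binds QUERY/KEY/VALUE/WEIGHTS/OUTPUT
OpTaskBinding bwd_binding = infer_bwd_binding(fwd.binding);
OpTaskInvocation bwd{task_id_t::ATTENTION_BWD_TASK_ID, bwd_binding};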
- */ - -#include "local-execution/ops/batch_matmul.h" -#include "kernels/batch_matmul_kernels.h" -#include "op-attrs/ops/batch_matmul.h" -#include "task-spec/op_task_signature.h" -#include "utils/containers/transform.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::BatchMatmul; - -enum Slots { - A_INPUT, // tensor - B_INPUT, // tensor - ATTRS, - OUTPUT, // tensor - PROFILING, - HANDLE, - ITERATION_CONFIG -}; - -OpTaskInvocation forward(BatchMatmulAttrs const &attrs) { - OpTaskBinding fwd; - - fwd.bind(A_INPUT, input_tensor(0)); - fwd.bind(B_INPUT, input_tensor(1)); - fwd.bind(OUTPUT, output_tensor(0)); - - fwd.bind_arg(ATTRS, attrs); - fwd.bind_arg(HANDLE, ff_handle()); - fwd.bind_arg(PROFILING, profiling_settings()); - fwd.bind_arg(ITERATION_CONFIG, iteration_config()); - - return {task_id_t::BATCHMATMUL_FWD_TASK_ID, fwd}; -} - -OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { - OpTaskBinding bwd = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::BATCHMATMUL_BWD_TASK_ID, bwd}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto a_input = acc.get_tensor(A_INPUT); - auto b_input = acc.get_tensor(B_INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - FFIterationConfig iter_config = - acc.get_argument(ITERATION_CONFIG); - - nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); - assert(m == output.shape.at(legion_dim_t{0_n})); - nonnegative_int n = a_input.shape.at(legion_dim_t{1_n}); - assert(n == output.shape.at(legion_dim_t{1_n})); - nonnegative_int k = a_input.shape.at(legion_dim_t{0_n}); - assert(k == b_input.shape.at(legion_dim_t{1_n})); - - assert(a_input.shape.get_volume() == b_input.shape.get_volume()); - assert(a_input.shape.get_volume() == output.shape.get_volume()); - - nonnegative_int batch = 1_n; - for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) { - nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i}); - assert(dim_size == b_input.shape.at(legion_dim_t{i})); - assert(dim_size == output.shape.at(legion_dim_t{i})); - batch *= dim_size; - } - - auto get_raw_seq_len = [](std::optional seq_len) -> int { - return transform(seq_len, - [](nonnegative_int x) { return x.unwrap_nonnegative(); }) - .value_or(-1); - }; - - return profile(forward_kernel, - profiling, - "[BatchMatmul] forward_time = {:.2lf}ms\n", - handle, - output.get_float_ptr(), - a_input.get_float_ptr(), - b_input.get_float_ptr(), - m.unwrap_nonnegative(), - n.unwrap_nonnegative(), - k.unwrap_nonnegative(), - batch.unwrap_nonnegative(), - get_raw_seq_len(attrs.a_seq_length_dim), - get_raw_seq_len(attrs.b_seq_length_dim), - iter_config.seq_length); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - // BatchMatmul* bmm = (BatchMatmul*) task->args; - FFIterationConfig iter_config = - acc.get_argument(ITERATION_CONFIG); - ProfilingSettings profiling = acc.get_argument(PROFILING); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - assert(output.shape == output_grad.shape); - - auto a_input = acc.get_tensor(A_INPUT); - auto a_input_grad = acc.get_tensor_grad(A_INPUT); - assert(a_input.shape == a_input_grad.shape); - - auto b_input = acc.get_tensor(B_INPUT); - auto b_input_grad = 
-      acc.get_tensor_grad(B_INPUT);
-  assert(b_input.shape == b_input_grad.shape);
-
-  // check dims
-  nonnegative_int m = b_input.shape.at(legion_dim_t{0_n});
-  assert(m == output.shape.at(legion_dim_t{0_n}));
-  nonnegative_int n = a_input.shape.at(legion_dim_t{1_n});
-  assert(n == output.shape.at(legion_dim_t{1_n}));
-  nonnegative_int k = a_input.shape.at(legion_dim_t{0_n});
-  assert(k == b_input.shape.at(legion_dim_t{1_n}));
-  assert(a_input.shape.get_volume() == b_input.shape.get_volume());
-  assert(a_input.shape.get_volume() == output.shape.get_volume());
-
-  nonnegative_int batch = 1_n;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) {
-    nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i});
-    assert(dim_size == b_input.shape.at(legion_dim_t{i}));
-    assert(dim_size == output.shape.at(legion_dim_t{i}));
-    batch *= dim_size;
-  }
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[BatchMatmul] backward_time = {:.2lf}ms\n",
-                 handle,
-                 output.get_float_ptr(),
-                 output_grad.get_float_ptr(),
-                 a_input.get_float_ptr(),
-                 a_input_grad.get_float_ptr(),
-                 b_input.get_float_ptr(),
-                 b_input_grad.get_float_ptr(),
-                 m.unwrap_nonnegative(),
-                 n.unwrap_nonnegative(),
-                 k.unwrap_nonnegative(),
-                 batch.unwrap_nonnegative());
-}
-
-TaskImplFunction get_batch_matmul_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_batch_matmul_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_batch_matmul_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(A_INPUT);
-  fwd.add_input_slot(B_INPUT);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot(ATTRS);
-  fwd.add_arg_slot(PROFILING);
-  fwd.add_unchecked_arg_slot(HANDLE);
-
-  return fwd;
-}
-
-OpTaskSignature get_batch_matmul_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_batch_matmul_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(BatchMatmulAttrs const &) {
-  return {task_id_t::BATCHMATMUL_FWD_TASK_ID,
-          task_id_t::BATCHMATMUL_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/batch_norm.cc b/lib/local-execution/src/local-execution/ops/batch_norm.cc
deleted file mode 100644
index 97dcb6e103..0000000000
--- a/lib/local-execution/src/local-execution/ops/batch_norm.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
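// Worked example of the dimension bookkeeping in the batch_matmul tasks
// above, with legion_dim_t{0_n} as the innermost dimension (the values
// are illustrative):
//
//   a_input = [k=64, n=64, batch=8]
//   b_input = [m=64, k=64, batch=8]
//   output  = [m=64, n=64, batch=8]
//
// m is read from b_input dim 0, n from a_input dim 1, and k from a_input
// dim 0 (checked against b_input dim 1); every dimension from index 2
// upward must agree across all three tensors, and their product (here 8)
// becomes the batch count passed to the kernel.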
- */
-#include "local-execution/ops/batch_norm.h"
-#include "kernels/batch_norm_kernels.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::BatchNorm;
-
-enum Slots {
-  INPUT,  // tensor
-  SCALE,  // tensor
-  BIAS,   // tensor
-  OUTPUT, // tensor
-  ATTRS,
-  PROFILING,
-  PER_DEVICE_STATE,
-  RELU,
-  HANDLE
-};
-
-OpTaskInvocation init(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(BIAS, input_tensor(2));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(HANDLE, ff_handle());
-
-  return {task_id_t::BATCHNORM_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding;
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state());
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(SCALE, input_tensor(1));
-  binding.bind(BIAS, input_tensor(2));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::BATCHNORM_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(BatchNormAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::BATCHNORM_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  Allocator allocator = acc.get_allocator();
-  PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto output = acc.get_tensor(OUTPUT);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  nonnegative_int output_w = output.shape.at(legion_dim_t{0_n});
-  nonnegative_int output_h = output.shape.at(legion_dim_t{1_n});
-  nonnegative_int output_c = output.shape.at(legion_dim_t{2_n});
-  nonnegative_int output_n = output.shape.at(legion_dim_t{3_n});
-
-  float *runningMean;
-
-  BatchNormPerDeviceState per_device_state =
-      init_kernel(handle,
-                  allocator,
-                  runningMean,
-                  output_n.unwrap_nonnegative(),
-                  output_c.unwrap_nonnegative(),
-                  output_h.unwrap_nonnegative(),
-                  output_w.unwrap_nonnegative(),
-                  attrs.relu);
-
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific::create(per_device_state)};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument(PER_DEVICE_STATE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto scale = acc.get_tensor(SCALE);
-  auto bias = acc.get_tensor(BIAS);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[BatchNorm] forward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input.get_float_ptr(),
-                 output.get_float_ptr(),
-                 scale.get_float_ptr(),
-                 bias.get_float_ptr());
-}
-
-static std::optional
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument(PER_DEVICE_STATE);
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto scale = acc.get_tensor(SCALE);
-  auto scale_grad = acc.get_tensor_grad(SCALE);
-  auto bias_grad = acc.get_tensor_grad(BIAS);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[BatchNorm] backward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input.get_float_ptr(),
-                 output_grad.get_float_ptr(),
-                 output.get_float_ptr(),
-                 input_grad.get_float_ptr(),
-                 scale.get_float_ptr(),
-                 scale_grad.get_float_ptr(),
-                 bias_grad.get_float_ptr(),
-                 output.shape.get_volume().unwrap_nonnegative());
-}
-
-TaskImplFunction get_batch_norm_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_batch_norm_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_batch_norm_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_batch_norm_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_input_slot(INPUT);
-  init.add_input_slot(BIAS);
-  init.add_output_slot(OUTPUT);
-  init.add_arg_slot(ATTRS);
-  init.add_arg_slot(PROFILING);
-  init.add_unchecked_arg_slot(HANDLE);
-
-  return init;
-}
-
-OpTaskSignature get_batch_norm_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_input_slot(SCALE);
-  fwd.add_input_slot(BIAS);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot(PROFILING);
-  fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
-
-  return fwd;
-}
-OpTaskSignature get_batch_norm_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_batch_norm_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(BatchNormAttrs const &) {
-  return {
-      task_id_t::BATCHNORM_INIT_TASK_ID,
-      task_id_t::BATCHNORM_FWD_TASK_ID,
-      task_id_t::BATCHNORM_BWD_TASK_ID,
-  };
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/cast.cc b/lib/local-execution/src/local-execution/ops/cast.cc
deleted file mode 100644
index e5dd7f9c4e..0000000000
--- a/lib/local-execution/src/local-execution/ops/cast.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "local-execution/ops/cast.h"
-#include "kernels/cast_kernels.h"
-
-#include "task-spec/op_task_signature.h"
-#include "utils/hash-utils.h"
-
-using namespace FlexFlow::Kernels::Cast;
-
-namespace FlexFlow {
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(CastAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::CAST_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(CastAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::CAST_BWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[Cast] forward_time = {:.2lf}ms\n",
-                 input,
-                 output,
-                 input.data_type,
-                 attrs.dtype);
-}
-
-static std::optional
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument(PROFILING);
-  auto const &attrs = acc.get_argument(ATTRS);
-
-  auto input = acc.get_tensor(INPUT);
-
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[Cast] backward_time = {:.2lf}ms\n",
-                 input_grad,
-                 output_grad,
-                 input.data_type,
-                 attrs.dtype);
-}
-
-TaskImplFunction get_cast_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_cast_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_cast_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot(ATTRS);
-  fwd.add_arg_slot(PROFILING);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-
-  return fwd;
-}
-
-OpTaskSignature get_cast_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_cast_fwd_signature());
-
-  return bwd;
-}
-
-std::vector get_task_ids(CastAttrs const &) {
-  return {task_id_t::CAST_FWD_TASK_ID, task_id_t::CAST_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/combine.cc b/lib/local-execution/src/local-execution/ops/combine.cc
deleted file mode 100644
index 32fab636d3..0000000000
--- a/lib/local-execution/src/local-execution/ops/combine.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
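// Hedged reading of the profile(...) helper used by every task in this
// patch, based only on its call sites here: it forwards the trailing
// arguments to the kernel and, when profiling is enabled, presumably
// returns the elapsed time in milliseconds (logged with the given format
// message), otherwise std::nullopt.
std::optional<float> elapsed = profile(forward_kernel,
                                       profiling,
                                       "[Cast] forward_time = {:.2lf}ms\n",
                                       input,
                                       output,
                                       input.data_type,
                                       attrs.dtype);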
- */ - -#include "local-execution/ops/combine.h" -#include "kernels/combine_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Combine; - -enum Slots { INPUT, OUTPUT, PROFILING }; - -OpTaskInvocation forward(CombineAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::COMBINE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(CombineAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::COMBINE_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Combine] forward_time = {:.2lf}ms\n", - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Combine] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -OpTaskSignature get_combine_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_combine_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_combine_fwd_signature()); - - return bwd; -} - -TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/concat.cc b/lib/local-execution/src/local-execution/ops/concat.cc deleted file mode 100644 index 8531bf77c0..0000000000 --- a/lib/local-execution/src/local-execution/ops/concat.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
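// Unlike cast and concat, the combine.cc removed above defines no
// get_task_ids; had it needed one, it would presumably have mirrored its
// stateless siblings. This sketch is hypothetical and not present in the
// removed file:
std::vector<task_id_t> get_task_ids(CombineAttrs const &) {
  return {task_id_t::COMBINE_FWD_TASK_ID, task_id_t::COMBINE_BWD_TASK_ID};
}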
- */ - -#include "local-execution/ops/concat.h" -#include "kernels/concat_kernels.h" -#include "task-spec/op_task_signature.h" -#include "task-spec/variadic_tensor_ref.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Concat; - -enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; - -OpTaskInvocation forward(ConcatAttrs const &attrs) { - OpTaskBinding binding; - binding.bind(INPUTS, get_input_tensors()); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::CONCAT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ConcatAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::CONCAT_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - - auto output = acc.get_tensor(OUTPUT); - auto inputs = acc.get_variadic_tensor(INPUTS); - - assert(inputs.size() <= MAX_NUM_INPUTS); - - return profile(forward_kernel, - profiling, - "[Concat] forward_time = {:.2lf}ms\n", - output, - inputs, - attrs.axis); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - - auto input_grads = acc.get_variadic_tensor_grad(INPUTS); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - assert(input_grads.size() <= MAX_NUM_INPUTS); - - return profile(backward_kernel, - profiling, - "[Concat] backward_time = {:.2lf}ms\n", - output_grad, - input_grads, - attrs.axis); -} - -TaskImplFunction get_concat_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_concat_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_concat_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(ATTRS); - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUTS, SlotType::VARIADIC); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_concat_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_concat_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ConcatAttrs const &) { - return {task_id_t::CONCAT_FWD_TASK_ID, task_id_t::CONCAT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/conv_2d.cc b/lib/local-execution/src/local-execution/ops/conv_2d.cc deleted file mode 100644 index 49dbc4b4b1..0000000000 --- a/lib/local-execution/src/local-execution/ops/conv_2d.cc +++ /dev/null @@ -1,184 +0,0 @@ -#include "local-execution/ops/conv_2d.h" -#include "kernels/conv_2d_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Conv2D; - -enum Slots { - INPUT, - OUTPUT, - FILTER, - BIAS, - ATTRS, - PROFILING, - PER_DEVICE_STATE, - HANDLE -}; - -OpTaskInvocation init(Conv2DAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::CONV2D_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(Conv2DAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - 
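// Concat is the one op here that uses a variadic slot: a single slot id
// carries every input tensor. Sketch of the two sides of that contract
// (the Permissions template argument is a reconstruction for
// illustration):
//
// binding side -- all inputs flow into one slot:
//   binding.bind(INPUTS, get_input_tensors());
// accessor side -- the task receives a vector of read-only accessors:
//   auto inputs = acc.get_variadic_tensor<Permissions::RO>(INPUTS);
//   assert(inputs.size() <= MAX_NUM_INPUTS);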
binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind(BIAS, weight_tensor(1)); - - return {task_id_t::CONV2D_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(Conv2DAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::CONV2D_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto attrs = acc.get_argument(ATTRS); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - auto filter_grad = acc.get_tensor_grad(FILTER); - - Conv2DPerDeviceState per_device_state = - init_kernel(/*handle=*/handle, - /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), - /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), - /*groups=*/attrs.groups.unwrap_nonnegative(), - /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), - /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), - /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), - /*input=*/input, - /*output=*/output, - /*filter_ptr=*/filter.get_float_ptr(), - /*filter_grad_ptr=*/filter_grad.get_float_ptr()); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto filter = acc.get_tensor(FILTER); - auto bias = acc.get_tensor(BIAS); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Conv2d] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - filter.get_float_ptr(), - bias.get_float_ptr(), - attrs.activation); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto filter = acc.get_tensor(FILTER); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto filter_grad = acc.get_tensor_grad(FILTER); - auto bias_grad = acc.get_tensor_grad(BIAS); - - return profile(backward_kernel, - profiling, - "[Conv2d] backward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - input_grad.get_float_ptr(), - output.get_float_ptr(), - output_grad.get_float_ptr(), - filter.get_float_ptr(), - filter_grad.get_float_ptr(), - bias_grad.get_float_ptr(), - attrs.activation); -} - -TaskImplFunction get_conv_2d_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_conv_2d_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_conv_2d_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_conv_2d_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - 
- init.add_input_slot(INPUT); - init.add_output_slot(OUTPUT); - init.add_weight_slot(FILTER); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_conv_2d_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_weight_slot(FILTER); - fwd.add_weight_slot(BIAS); - - return fwd; -} - -OpTaskSignature get_conv_2d_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_conv_2d_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(Conv2DAttrs const &) { - return {task_id_t::CONV2D_INIT_TASK_ID, - task_id_t::CONV2D_FWD_TASK_ID, - task_id_t::CONV2D_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/dropout.cc b/lib/local-execution/src/local-execution/ops/dropout.cc deleted file mode 100644 index cc09841190..0000000000 --- a/lib/local-execution/src/local-execution/ops/dropout.cc +++ /dev/null @@ -1,134 +0,0 @@ -#include "local-execution/ops/dropout.h" -#include "kernels/dropout_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/op_task_signature.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Dropout; - -enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; - -OpTaskInvocation init(DropoutAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(FF_HANDLE, ff_handle()); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::DROPOUT_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(DropoutAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - return {task_id_t::DROPOUT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(DropoutAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::DROPOUT_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto output = acc.get_tensor(OUTPUT); - Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(FF_HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - - DropoutPerDeviceState per_device_state = - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Dropout] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - 
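// Conv2D follows the same three-phase lifecycle as attention, batch norm,
// and dropout; the per-device-state handoff from init to forward reduces
// to this sketch (template arguments reconstructed for illustration):
//
// in init_task_impl -- wrap the freshly built state for the device:
//   return DeviceSpecificDeviceStates{
//       DeviceSpecific<Conv2DPerDeviceState>::create(per_device_state)};
// in forward_task_impl -- fetch the same state back out of the binding:
//   auto state =
//       acc.get_argument<Conv2DPerDeviceState>(PER_DEVICE_STATE);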
return profile(backward_kernel, - profiling, - "[Dropout] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - input_grad.get_float_ptr()); -} - -TaskImplFunction get_dropout_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_dropout_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_dropout_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_dropout_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(FF_HANDLE); - init.add_output_slot(OUTPUT); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_dropout_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_dropout_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_dropout_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(DropoutAttrs const &) { - return {task_id_t::DROPOUT_INIT_TASK_ID, - task_id_t::DROPOUT_FWD_TASK_ID, - task_id_t::DROPOUT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/element_binary.cc b/lib/local-execution/src/local-execution/ops/element_binary.cc deleted file mode 100644 index ec8ed298d0..0000000000 --- a/lib/local-execution/src/local-execution/ops/element_binary.cc +++ /dev/null @@ -1,180 +0,0 @@ -#include "local-execution/ops/element_binary.h" -#include "kernels/element_binary_kernels.h" -#include "local-execution/task_signature_impl.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::ElementBinary; - -enum Slots { - LHS_INPUT, - RHS_INPUT, - OUTPUT, - PROFILING, - PER_DEVICE_STATE, - HANDLE, - ATTRS -}; - -OpTaskInvocation init(ElementBinaryAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(ElementBinaryAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::ELEMENTBINARY_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ElementBinaryAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ELEMENTBINARY_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - auto output = acc.get_tensor(OUTPUT); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - - ElementBinaryPerDeviceState per_device_state = - init_kernel(handle, - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - input_lhs.shape, - input_rhs.shape, - 
output.shape); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto const &attrs = acc.get_argument(ATTRS); - - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - return profile(forward_kernel, - profiling, - "[ElementBinary] forward_time = {:.2lf}ms\n", - per_device_state, - input_lhs.get_float_ptr(), - input_rhs.get_float_ptr(), - output.get_float_ptr(), - attrs.type, - attrs.should_broadcast_lhs, - handle); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto const &attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto input_lhs = acc.get_tensor(LHS_INPUT); - auto input_rhs = acc.get_tensor(RHS_INPUT); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto input_lhs_grad = acc.get_tensor_grad(LHS_INPUT); - auto input_rhs_grad = acc.get_tensor_grad(RHS_INPUT); - - return profile(backward_kernel, - profiling, - "[ElementBinary] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - input_lhs.get_float_ptr(), - input_rhs.get_float_ptr(), - input_lhs_grad.get_float_ptr(), - input_rhs_grad.get_float_ptr(), - attrs.type, - attrs.should_broadcast_lhs, - attrs.should_broadcast_rhs, - handle); -} - -TaskImplFunction get_element_binary_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} - -TaskImplFunction get_element_binary_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} - -TaskImplFunction get_element_binary_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_element_binary_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(LHS_INPUT); - init.add_input_slot(RHS_INPUT); - init.add_output_slot(OUTPUT); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_element_binary_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(HANDLE); - - fwd.add_input_slot(LHS_INPUT); - fwd.add_input_slot(RHS_INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_element_binary_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_element_binary_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ElementBinaryAttrs const &) { - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, - task_id_t::ELEMENTBINARY_FWD_TASK_ID, - task_id_t::ELEMENTBINARY_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/element_unary.cc b/lib/local-execution/src/local-execution/ops/element_unary.cc deleted file mode 100644 index 106c0760cd..0000000000 --- a/lib/local-execution/src/local-execution/ops/element_unary.cc +++ /dev/null @@ -1,165 +0,0 @@ -#include "local-execution/ops/element_unary.h" -#include "kernels/element_unary_kernels.h" 
-#include "op-attrs/parallel_tensor_shape.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -// declare Legion names - -using namespace FlexFlow::Kernels::ElementUnary; - -enum Slots { - INPUT, - INPUT_SHAPE, - OUTPUT, - ATTRS, - HANDLE, - PROFILING, - PER_DEVICE_STATE -}; - -/* ElementUnary */ -OpTaskInvocation init(ElementUnaryAttrs const &attrs) { - OpTaskBinding b; - - b.bind_arg(ATTRS, attrs); - b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); - - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, b}; -} - -OpTaskInvocation forward(ElementUnaryAttrs const &attrs) { - OpTaskBinding b; - - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - b.bind_arg(ATTRS, attrs); - - b.bind_arg(HANDLE, ff_handle()); - b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - return {task_id_t::ELEMENTUNARY_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(ElementUnaryAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::ELEMENTUNARY_BWD_TASK_ID, b}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - auto attrs = acc.get_argument(ATTRS); - - ParallelTensorShape input_shape = - acc.get_argument(INPUT_SHAPE); - - ParallelTensorShape output_shape = - throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = - init_kernel(ArrayShape{get_piece_shape(input_shape)}, - ArrayShape{get_piece_shape(output_shape)}, - attrs); - - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - auto handle = acc.get_argument(HANDLE); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[ElementUnary] forward_time = {:.2lf}ms\n", - per_device_state, - attrs, - handle, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - auto const &attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); - - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - return profile(backward_kernel, - profiling, - "[ElementUnary] backward_time = {:.2lf}ms\n", - per_device_state, - attrs, - handle, - input, - input_grad, - output, - output_grad); -} - -TaskImplFunction get_element_unary_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_element_unary_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_element_unary_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_element_unary_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(INPUT_SHAPE); - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_element_unary_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - 
- fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - return fwd; -} - -OpTaskSignature get_element_unary_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_element_unary_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(ElementUnaryAttrs const &) { - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, - task_id_t::ELEMENTUNARY_FWD_TASK_ID, - task_id_t::ELEMENTUNARY_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/flat.cc b/lib/local-execution/src/local-execution/ops/flat.cc deleted file mode 100644 index 87295c2297..0000000000 --- a/lib/local-execution/src/local-execution/ops/flat.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include "local-execution/ops/flat.h" -#include "kernels/flat_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Flat; - -enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING }; - -OpTaskInvocation forward(FlatAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - binding.bind_arg(PROFILING, profiling_settings()); - return {task_id_t::FLAT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(FlatAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::FLAT_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Flat] forward_time = {:.2lf}ms\n", - input, - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Flat] backward_time = {:.2lf}ms\n", - input, - input_grad.get_float_ptr(), - output_grad.get_float_ptr()); -} - -TaskImplFunction get_flat_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_flat_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_flat_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_flat_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_flat_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(FlatAttrs const &) { - return {task_id_t::FLAT_FWD_TASK_ID, task_id_t::FLAT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/gather.cc b/lib/local-execution/src/local-execution/ops/gather.cc deleted file mode 100644 index 7e4b99a557..0000000000 --- a/lib/local-execution/src/local-execution/ops/gather.cc +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/gather.h" -#include "kernels/gather_kernels.h" -#include "utils/nonnegative_int/nonnegative_range.h" -#include - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Gather; - -enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(GatherAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind(INPUT, input_tensor(0)); - binding.bind(INDEX, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(HANDLE, ff_handle()); - - return {task_id_t::GATHER_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(GatherAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDEX, weight_tensor(0)); - - return {task_id_t::GATHER_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(GatherAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::GATHER_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto index = acc.get_tensor(INDEX); - auto output = acc.get_tensor(OUTPUT); - - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = - legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - - assert(input.shape.get_dim() == index.shape.get_dim()); - assert(output.shape.get_dim() == index.shape.get_dim()); - - for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { - assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); - if (i != legion_dim.value) { - assert(input.shape.at(legion_dim_t{i}) == - index.shape.at(legion_dim_t{i})); - } - } - - GatherPerDeviceState per_device_state = {handle, legion_dim}; - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - auto input = acc.get_tensor(INPUT); - auto index = acc.get_tensor(INDEX); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Gather] forward_time = {:.2lf}ms\n", - per_device_state, - input, - index, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto index = acc.get_tensor(INDEX); - auto input_grad = acc.get_tensor_grad(INPUT); - - return profile(backward_kernel, - profiling, - "[Gather] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad, - index, - input_grad); -} 
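Aside: every forward_task_impl and backward_task_impl in these deleted operator files returns through the same profile(...) helper. As a reading aid, here is a minimal sketch of what such a wrapper can look like; profile_sketch and its boolean profiling_enabled flag are illustrative assumptions, not the actual FlexFlow profile helper (which takes a ProfilingSettings and renders the format string shown at each call site).

// Illustrative sketch only (not FlexFlow's actual helper): wraps a kernel
// call and, when profiling is enabled, returns the elapsed time in ms.
#include <chrono>
#include <optional>
#include <string>
#include <utility>

template <typename F, typename... Ts>
std::optional<float> profile_sketch(F const &kernel,
                                    bool profiling_enabled,
                                    std::string const &fmt_msg,
                                    Ts &&...args) {
  if (!profiling_enabled) {
    kernel(std::forward<Ts>(args)...);
    return std::nullopt;
  }
  auto start = std::chrono::steady_clock::now();
  kernel(std::forward<Ts>(args)...);
  // (GPU kernels would additionally need a device synchronization here for
  // the measurement to be accurate.)
  auto stop = std::chrono::steady_clock::now();
  float elapsed_ms =
      std::chrono::duration<float, std::milli>(stop - start).count();
  // A real implementation would log fmt_msg with elapsed_ms substituted in,
  // e.g. "[Gather] forward_time = 1.23ms".
  return elapsed_ms;
}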
- -TaskImplFunction get_gather_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_gather_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_gather_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_gather_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(INPUT); - init.add_input_slot(INDEX); - init.add_output_slot(OUTPUT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - - return init; -} - -OpTaskSignature get_gather_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_weight_slot(INDEX); - - return fwd; -} - -OpTaskSignature get_gather_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_gather_fwd_signature()); - - return bwd; -} - -std::vector get_task_ids(GatherAttrs const &) { - return {task_id_t::GATHER_INIT_TASK_ID, - task_id_t::GATHER_FWD_TASK_ID, - task_id_t::GATHER_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/input.cc b/lib/local-execution/src/local-execution/ops/input.cc deleted file mode 100644 index d7a3888220..0000000000 --- a/lib/local-execution/src/local-execution/ops/input.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/ops/input.h" - -namespace FlexFlow { - -std::vector get_task_ids(InputAttrs const &attrs) { - return {}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/layer_norm.cc b/lib/local-execution/src/local-execution/ops/layer_norm.cc deleted file mode 100644 index d2fc930375..0000000000 --- a/lib/local-execution/src/local-execution/ops/layer_norm.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "local-execution/ops/layer_norm.h"
-#include "kernels/layer_norm_kernels.h"
-#include "op-attrs/ops/layer_norm.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-#include "utils/nonnegative_int/nonnegative_range.h"
-#include
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::LayerNorm;
-
-enum Slots {
- PROFILING,
- INPUT,
- OUTPUT,
- GAMMA,
- BETA,
- PER_DEVICE_STATE,
- ATTRS,
- HANDLE
-};
-
-OpTaskInvocation init(LayerNormAttrs const &attrs) {
- OpTaskBinding b;
-
- b.bind(INPUT, input_tensor(0));
-
- b.bind_arg(HANDLE, ff_handle());
- b.bind_arg(ATTRS, attrs);
-
- return {task_id_t::LAYERNORM_INIT_TASK_ID, b};
-}
-
-OpTaskInvocation forward(LayerNormAttrs const &attrs) {
- OpTaskBinding b;
-
- b.bind(INPUT, input_tensor(0));
- b.bind(OUTPUT, output_tensor(0));
- b.bind(GAMMA, weight_tensor(0)); // TODO: verify how gamma and beta
- b.bind(BETA, weight_tensor(1)); // are ordered among the weight tensors
- b.bind_arg(PROFILING, profiling_settings());
- b.bind_arg(PER_DEVICE_STATE, per_device_op_state());
-
- return {task_id_t::LAYERNORM_FWD_TASK_ID, b};
-}
-
-OpTaskInvocation backward(LayerNormAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::LAYERNORM_BWD_TASK_ID, b};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
- auto gamma = acc.get_tensor(GAMMA);
- auto beta = acc.get_tensor(BETA);
-
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto &state = acc.get_argument(PER_DEVICE_STATE);
-
- return profile(forward_kernel,
- profiling,
- "[LayerNorm] forward time = {:.2lf}ms\n",
- state,
- input,
- output,
- gamma,
- beta);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- auto gamma = acc.get_tensor(GAMMA);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto gamma_grad = acc.get_tensor_grad(GAMMA);
- auto beta_grad = acc.get_tensor_grad(BETA);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto &state = acc.get_argument(PER_DEVICE_STATE);
-
- return profile(backward_kernel,
- profiling,
- "[LayerNorm] backward time = {:.2lf}ms\n",
- state,
- output_grad,
- input,
- input_grad,
- gamma,
- gamma_grad,
- beta_grad);
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- Allocator allocator = acc.get_allocator();
- auto input = acc.get_tensor(INPUT);
- auto handle = acc.get_argument(HANDLE);
-
- nonnegative_int M = 1_n;
- for (int i = 0; i < attrs.axes.size(); i++) {
- legion_dim_t legion_dim =
- legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims());
- M *= input.shape.at(legion_dim);
- }
- nonnegative_int num_replicas = 1_n;
- for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) {
- num_replicas *= input.shape.at(legion_dim_t{i});
- }
- nonnegative_int effective_num_elements = M;
- nonnegative_int effective_batch_size = input.shape.get_volume() / M;
-
- LayerNormPerDeviceState per_device_state =
- init_kernel(handle,
- allocator,
- attrs.elementwise_affine,
- effective_batch_size.unwrap_nonnegative(),
- effective_num_elements.unwrap_nonnegative(),
- attrs.eps);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-TaskImplFunction
get_layer_norm_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_layer_norm_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_layer_norm_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_layer_norm_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- fwd.add_weight_slot(GAMMA);
- fwd.add_weight_slot(BETA);
-
- fwd.add_arg_slot(PROFILING);
- fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
- return fwd;
-}
-
-OpTaskSignature get_layer_norm_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_layer_norm_fwd_signature());
- return bwd;
-}
-
-OpTaskSignature get_layer_norm_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- init.add_input_slot(INPUT);
- init.add_arg_slot(ATTRS);
- init.add_unchecked_arg_slot(HANDLE);
-
- init.add_return_value();
- return init;
-}
-
-std::vector get_task_ids(LayerNormAttrs const &) {
- return {task_id_t::LAYERNORM_INIT_TASK_ID,
- task_id_t::LAYERNORM_FWD_TASK_ID,
- task_id_t::LAYERNORM_BWD_TASK_ID};
-}
-
-} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/linear.cc b/lib/local-execution/src/local-execution/ops/linear.cc
deleted file mode 100644
index 768293b32f..0000000000
--- a/lib/local-execution/src/local-execution/ops/linear.cc
+++ /dev/null
@@ -1,210 +0,0 @@
-#include "local-execution/ops/linear.h"
-#include "kernels/linear_kernels.h"
-#include "local-execution/task_argument_accessor.h"
-#include "op-attrs/ff_dim_t.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Linear;
-
-enum Slots {
- INPUT,
- OUTPUT,
- WEIGHT,
- BIAS,
- ATTRS,
- PROFILING,
- HANDLE,
- PER_DEVICE_STATE
-};
-
-OpTaskInvocation init(LinearAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind_arg(ATTRS, attrs);
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(WEIGHT, weight_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::LINEAR_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(LinearAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(WEIGHT, weight_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- if (attrs.use_bias) {
- binding.bind(BIAS, weight_tensor(1));
- }
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind_arg(ATTRS, attrs);
-
- return {task_id_t::LINEAR_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(LinearAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::LINEAR_BWD_TASK_ID, b};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- auto input = acc.get_tensor(INPUT);
- auto weight = acc.get_tensor(WEIGHT);
- auto output = acc.get_tensor(OUTPUT);
- nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n});
- nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n});
-
- // initialized so init_kernel is not handed an indeterminate pointer
- float *one_ptr = nullptr;
-
- LinearPerDeviceState per_device_state =
- init_kernel(handle,
- one_ptr,
- attrs.activation,
attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size.unwrap_nonnegative(), - attrs.out_channels.unwrap_nonnegative()); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; - - float const *bias_ptr = NULL; - if (attrs.use_bias) { - auto bias = acc.get_tensor(BIAS); - bias_ptr = bias.get_float_ptr(); - } - - return profile(forward_kernel, - profiling, - "[Linear] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - weight.get_float_ptr(), - bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - float const *bias_ptr = NULL; - if (attrs.use_bias) { - auto bias = acc.get_tensor(BIAS); - bias_ptr = bias.get_float_ptr(); - } - - nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); - nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); - nonnegative_int batch_size = output.shape.get_volume() / out_dim; - - return profile(backward_kernel, - profiling, - "[Linear] backward_time = {:.2lf}ms\n", - per_device_state, - (void *)input.get_float_ptr(), - (void *)input_grad.get_float_ptr(), - (void *)output.get_float_ptr(), - (void *)output_grad.get_float_ptr(), - (void *)weight.get_float_ptr(), - (void *)weight_grad.get_float_ptr(), - (void *)bias_ptr, - in_dim.unwrap_nonnegative(), - out_dim.unwrap_nonnegative(), - batch_size.unwrap_nonnegative()); -} - -TaskImplFunction get_linear_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_linear_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_linear_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_linear_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_input_slot(INPUT); - init.add_weight_slot(WEIGHT); - init.add_output_slot(OUTPUT); - - init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); - - init.add_return_value(); - return init; -} - -OpTaskSignature get_linear_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_weight_slot(WEIGHT); - fwd.add_optional_weight_slot(BIAS); - fwd.add_output_slot(OUTPUT); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - return fwd; -} - 
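All of the get_*_bwd_signature functions in these files (like get_linear_bwd_signature, next) derive the backward signature mechanically from the forward one via infer_bwd_signature, and infer_bwd_binding does the same for bindings. Conceptually, the derivation keeps every forward slot and additionally exposes a gradient slot for each tensor, so backward tasks can read forward values and write grads. A rough sketch of the idea, using hypothetical simplified types rather than the real OpTaskSignature API:

// Conceptual sketch only; the real OpTaskSignature/infer_bwd_signature differ.
#include <vector>

enum class TensorRole { INPUT, OUTPUT, WEIGHT };

struct TensorSlotSpec {
  int slot;
  TensorRole role;
  bool is_grad;
};

using SignatureSketch = std::vector<TensorSlotSpec>;

SignatureSketch infer_bwd_signature_sketch(SignatureSketch const &fwd) {
  SignatureSketch bwd = fwd; // backward can still read all forward tensors
  for (TensorSlotSpec const &s : fwd) {
    // ...and additionally gets a gradient accessor for each tensor slot.
    bwd.push_back(TensorSlotSpec{s.slot, s.role, /*is_grad=*/true});
  }
  return bwd;
}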
-OpTaskSignature get_linear_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_linear_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(LinearAttrs const &) {
- return {task_id_t::LINEAR_INIT_TASK_ID,
- task_id_t::LINEAR_FWD_TASK_ID,
- task_id_t::LINEAR_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/noop.cc b/lib/local-execution/src/local-execution/ops/noop.cc
deleted file mode 100644
index 7357806880..0000000000
--- a/lib/local-execution/src/local-execution/ops/noop.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "local-execution/ops/noop.h"
-
-namespace FlexFlow {
-
-std::vector get_task_ids(NoopAttrs const &attrs) {
- return {};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/pool_2d.cc b/lib/local-execution/src/local-execution/ops/pool_2d.cc
deleted file mode 100644
index 8622732a4d..0000000000
--- a/lib/local-execution/src/local-execution/ops/pool_2d.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-#include "local-execution/ops/pool_2d.h"
-#include "kernels/pool_2d_kernels.h"
-
-#include "op-attrs/ops/pool_2d.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-using namespace FlexFlow::Kernels::Pool2D;
-
-namespace FlexFlow {
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
-
-OpTaskInvocation init(Pool2DAttrs const &attrs) {
- OpTaskBinding binding;
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- binding.bind_arg(ATTRS, attrs);
- binding.bind_arg(HANDLE, ff_handle());
-
- return {task_id_t::POOL2D_INIT_TASK_ID, binding};
-}
-
-static nonnegative_int calculate_padding(nonnegative_int output_size,
- nonnegative_int stride,
- nonnegative_int kernel_size,
- nonnegative_int input_size) {
- int o = output_size.unwrap_nonnegative();
- int s = stride.unwrap_nonnegative();
- int k = kernel_size.unwrap_nonnegative();
- int i = input_size.unwrap_nonnegative();
-
- // solves output_size = (input_size + 2 * padding - kernel_size) / stride + 1
- // for padding, rounding up
- return nonnegative_int{
- ((o - 1) * s + k - i + 1) / 2,
- };
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto const &attrs = acc.get_argument(ATTRS);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- nonnegative_int input_w = input.shape.at(ff_dim_t{0_n});
- nonnegative_int input_h = input.shape.at(ff_dim_t{1_n});
- nonnegative_int input_c = input.shape.at(ff_dim_t{2_n});
- nonnegative_int input_n = input.shape.at(ff_dim_t{3_n});
- nonnegative_int output_w = output.shape.at(ff_dim_t{0_n});
- nonnegative_int output_h = output.shape.at(ff_dim_t{1_n});
- nonnegative_int output_c = output.shape.at(ff_dim_t{2_n});
- nonnegative_int output_n = output.shape.at(ff_dim_t{3_n});
-
- Pool2DPerDeviceState per_device_state =
- init_kernel(handle,
- attrs.activation,
input_w.unwrap_nonnegative(),
- input_h.unwrap_nonnegative(),
- input_c.unwrap_nonnegative(),
- input_n.unwrap_nonnegative(),
- output_w.unwrap_nonnegative(),
- output_h.unwrap_nonnegative(),
- output_c.unwrap_nonnegative(),
- output_n.unwrap_nonnegative(),
- attrs.padding_h.unwrap_nonnegative(),
- attrs.padding_w.unwrap_nonnegative(),
- attrs.kernel_h.unwrap_nonnegative(),
- attrs.kernel_w.unwrap_nonnegative(),
- attrs.stride_h.unwrap_nonnegative(),
- attrs.stride_w.unwrap_nonnegative(),
- attrs.pool_type);
-
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-OpTaskInvocation forward(Pool2DAttrs const &attrs) {
- OpTaskBinding binding;
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
-
- return {task_id_t::POOL2D_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(Pool2DAttrs const &attrs) {
- OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::POOL2D_BWD_TASK_ID, b};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- Pool2DPerDeviceState state =
- acc.get_argument(PER_DEVICE_STATE);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Pool2D] forward_time = {:.2lf}ms\n",
- state,
- input.get_float_ptr(),
- output.get_float_ptr());
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- Pool2DPerDeviceState state =
- acc.get_argument(PER_DEVICE_STATE);
-
- auto input = acc.get_tensor(INPUT);
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output = acc.get_tensor(OUTPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Pool2D] backward_time = {:.2lf}ms\n",
- state,
- input.get_float_ptr(),
- input_grad.get_float_ptr(),
- output.get_float_ptr(),
- output_grad.get_float_ptr());
-}
-
-TaskImplFunction get_pool_2d_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_pool_2d_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_pool_2d_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_pool_2d_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- init.add_input_slot(INPUT);
- init.add_output_slot(OUTPUT);
-
- init.add_arg_slot(ATTRS);
- init.add_unchecked_arg_slot(HANDLE);
-
- init.add_return_value();
- return init;
-}
-OpTaskSignature get_pool_2d_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- fwd.add_arg_slot(PROFILING);
-
- fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
- return fwd;
-}
-OpTaskSignature get_pool_2d_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_pool_2d_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(Pool2DAttrs const &) {
- return {task_id_t::POOL2D_INIT_TASK_ID,
- task_id_t::POOL2D_FWD_TASK_ID,
- task_id_t::POOL2D_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/reduce.cc b/lib/local-execution/src/local-execution/ops/reduce.cc
deleted file mode 100644
index bc4b5343c2..0000000000
--- a/lib/local-execution/src/local-execution/ops/reduce.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include "local-execution/ops/reduce.h"
-#include "kernels/reduce_kernels.h"
-
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-#include "utils/type_traits_core.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Reduce;
-
-enum Slots {
- INPUT,
- OUTPUT,
- ATTRS,
- PROFILING,
- REDUCE,
- PER_DEVICE_STATE,
- HANDLE
-};
-
-OpTaskInvocation init(ReduceAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind_arg(ATTRS, attrs);
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REDUCE_INIT_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
- auto attrs = acc.get_argument(ATTRS);
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- OperatorType op_type = attrs.op_type;
-
- nonnegative_int reduction_size =
- input.shape.get_volume() / output.shape.get_volume();
- ReducePerDeviceState per_device_state =
- init_kernel(handle,
- op_type,
- reduction_size.unwrap_nonnegative(),
- input.shape,
- output.shape);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-// Note: forward_kernel only needs ReducePerDeviceState, input, output
-OpTaskInvocation forward(ReduceAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind_arg(PROFILING, profiling_settings());
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REDUCE_FWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Reduce] forward_time = {:.2lf}ms\n",
- per_device_state,
- input.get_float_ptr(),
- output.get_float_ptr());
-}
-
-OpTaskInvocation backward(ReduceAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REDUCE_BWD_TASK_ID, binding};
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Reduce] backward_time = {:.2lf}ms\n",
- per_device_state,
- output_grad.get_float_ptr(),
- input_grad.get_float_ptr());
-}
-
-TaskImplFunction get_reduce_init_task_impl() {
- return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_reduce_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_reduce_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_reduce_init_signature() {
- OpTaskSignature init(OpTaskType::INIT);
-
- // init() binds INPUT and OUTPUT and init_task_impl reads both, so the
- // signature declares those tensor slots as well.
- init.add_input_slot(INPUT);
- init.add_output_slot(OUTPUT);
-
- init.add_unchecked_arg_slot(HANDLE);
- init.add_arg_slot(ATTRS);
-
- init.add_return_value();
- return init;
-}
-OpTaskSignature get_reduce_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
-
fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_reduce_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reduce_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReduceAttrs const &) { - return {task_id_t::REDUCE_INIT_TASK_ID, - task_id_t::REDUCE_FWD_TASK_ID, - task_id_t::REDUCE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/reduction.cc b/lib/local-execution/src/local-execution/ops/reduction.cc deleted file mode 100644 index 5e90b30fac..0000000000 --- a/lib/local-execution/src/local-execution/ops/reduction.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/reduction.h" -#include "kernels/reduction_kernels.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reduction; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(ReductionAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::REDUCTION_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ReductionAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::REDUCTION_BWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling_settings = - acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int num_replicas = attrs.reduction_degree; - - return profile(forward_kernel, - profiling_settings, - "[Reduction] forward_time = {:.2lf}ms\n", - input, - output, - num_replicas.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - return profile(backward_kernel, - profiling, - "[Reduction] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -TaskImplFunction get_reduction_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reduction_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reduction_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature 
get_reduction_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_reduction_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(ReductionAttrs const &) {
- return {task_id_t::REDUCTION_FWD_TASK_ID, task_id_t::REDUCTION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/repartition.cc b/lib/local-execution/src/local-execution/ops/repartition.cc
deleted file mode 100644
index c1b3bbe3c6..0000000000
--- a/lib/local-execution/src/local-execution/ops/repartition.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "local-execution/ops/repartition.h"
-#include "kernels/partition_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Repartition;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, HANDLE, PER_DEVICE_STATE };
-
-OpTaskInvocation init(RepartitionAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(HANDLE, ff_handle());
- binding.bind(INPUT, input_tensor(0));
-
- return {task_id_t::REPARTITION_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(RepartitionAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PROFILING, profiling_settings());
- binding.bind_arg(ATTRS, attrs);
- binding.bind_arg(PER_DEVICE_STATE,
- per_device_op_state());
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
-
- return {task_id_t::REPARTITION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(RepartitionAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REPARTITION_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
- init_task_impl(TaskArgumentAccessor const &acc) {
- auto input = acc.get_tensor(INPUT);
- PerDeviceFFHandle handle = acc.get_argument(HANDLE);
-
- // Note: use the input data type
-
- RepartitionPerDeviceState per_device_state =
- init_kernel(handle, input.data_type);
- return DeviceSpecificDeviceStates{
- DeviceSpecific::create(per_device_state)};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[Repartition/Partition] forward_time = {:.2lf}ms\n",
- per_device_state,
- input,
- output);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
- auto per_device_state =
- acc.get_argument(PER_DEVICE_STATE);
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
-
- return profile(backward_kernel,
- profiling,
- "[Repartition/Partition] backward_time = {:.2lf}ms\n",
- per_device_state, - output_grad, - input_grad); -} - -TaskImplFunction get_repartition_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_repartition_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_repartition_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_repartition_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_unchecked_arg_slot(HANDLE); - init.add_input_slot(INPUT); - init.add_return_value(); - return init; -} -OpTaskSignature get_repartition_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - return fwd; -} -OpTaskSignature get_repartition_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_repartition_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(RepartitionAttrs const &) { - return {task_id_t::REPARTITION_INIT_TASK_ID, - task_id_t::REPARTITION_FWD_TASK_ID, - task_id_t::REPARTITION_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/replicate.cc b/lib/local-execution/src/local-execution/ops/replicate.cc deleted file mode 100644 index ea5be55409..0000000000 --- a/lib/local-execution/src/local-execution/ops/replicate.cc +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "local-execution/ops/replicate.h"
-#include "kernels/replicate_kernels.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Replicate;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReplicateAttrs const &attrs) {
- OpTaskBinding binding;
-
- binding.bind_arg(PROFILING, profiling_settings());
-
- binding.bind(INPUT, input_tensor(0));
- binding.bind(OUTPUT, output_tensor(0));
- binding.bind_arg(ATTRS, attrs);
-
- return {task_id_t::REPLICATE_FWD_TASK_ID, binding};
-}
-OpTaskInvocation backward(ReplicateAttrs const &attrs) {
- OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
- return {task_id_t::REPLICATE_BWD_TASK_ID, binding};
-}
-
-static std::optional forward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input = acc.get_tensor(INPUT);
- auto output = acc.get_tensor(OUTPUT);
-
- return profile(forward_kernel,
- profiling,
- "[replicate] forward_time = {:.2lf}ms\n",
- input,
- output);
-}
-
-static std::optional
- backward_task_impl(TaskArgumentAccessor const &acc) {
- ProfilingSettings profiling = acc.get_argument(PROFILING);
-
- auto input_grad = acc.get_tensor_grad(INPUT);
- auto output_grad = acc.get_tensor_grad(OUTPUT);
- auto attrs = acc.get_argument(ATTRS);
-
- return profile(backward_kernel,
- profiling,
- "[replicate] backward_time = {:.2lf}ms\n",
- input_grad,
- output_grad,
- attrs.replicate_degree.unwrap_nonnegative());
-}
-
-TaskImplFunction get_replicate_fwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_replicate_bwd_task_impl() {
- return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_replicate_fwd_signature() {
- OpTaskSignature fwd(OpTaskType::FWD);
-
- fwd.add_arg_slot(PROFILING);
- // forward() binds ATTRS and backward_task_impl reads it through the
- // inferred binding, so the signature declares the slot as well.
- fwd.add_arg_slot(ATTRS);
- fwd.add_input_slot(INPUT);
- fwd.add_output_slot(OUTPUT);
- return fwd;
-}
-
-OpTaskSignature get_replicate_bwd_signature() {
- OpTaskSignature bwd = infer_bwd_signature(get_replicate_fwd_signature());
- return bwd;
-}
-
-std::vector get_task_ids(ReplicateAttrs const &) {
- return {task_id_t::REPLICATE_FWD_TASK_ID, task_id_t::REPLICATE_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/ops/reshape.cc b/lib/local-execution/src/local-execution/ops/reshape.cc
deleted file mode 100644
index f04785c904..0000000000
--- a/lib/local-execution/src/local-execution/ops/reshape.cc
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "local-execution/ops/reshape.h" -#include "kernels/reshape_kernels.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reshape; - -enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(ReshapeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::RESHAPE_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(ReshapeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - return {task_id_t::RESHAPE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(ReshapeAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::RESHAPE_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - - ReshapePerDeviceState per_device_state = init_kernel(attrs.shape.data_type); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Reshape] forward time = {:.2lf}ms\n", - per_device_state, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Reshape] backward time = {:.2lf}ms\n", - per_device_state, - input_grad, - output_grad); -} - -TaskImplFunction get_reshape_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_reshape_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reshape_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reshape_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - - init.add_return_value(); - return init; -} -OpTaskSignature get_reshape_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_reshape_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reshape_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReshapeAttrs const &) { - return {task_id_t::RESHAPE_INIT_TASK_ID, - task_id_t::RESHAPE_FWD_TASK_ID, - task_id_t::RESHAPE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/reverse.cc b/lib/local-execution/src/local-execution/ops/reverse.cc deleted file mode 100644 index 66c0ef7c5e..0000000000 --- a/lib/local-execution/src/local-execution/ops/reverse.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - 
* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/reverse.h" -#include "kernels/accessor.h" -#include "kernels/reverse_kernels.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Reverse; -using coord_t = long long; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(ReverseAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::REVERSE_FWD_TASK_ID, binding}; -} -OpTaskInvocation backward(ReverseAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::REVERSE_BWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - nonnegative_int output_size = output.shape.get_volume(); - auto axis = attrs.axis; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { - if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{i}); - } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= output.shape.at(ff_dim_t{i}); - } - } - - return profile(forward_kernel, - profiling, - "[reverse] forward_time = {:.2lf}ms\n", - input.get_float_ptr(), - output.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - output_size.unwrap_nonnegative()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - int axis = input_grad.shape.num_dims().unwrap_nonnegative() - - attrs.axis.value.unwrap_nonnegative() - 1; - nonnegative_int in_blk_size = 1_n; - nonnegative_int reverse_dim_size = 1_n; - nonnegative_int num_out_blks = 1_n; - for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { - if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{i}); - } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); - } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{i}); - } - } - - return profile(backward_kernel, - profiling, - "[reverse] backward_time = {:.2lf}ms\n", - output_grad.get_float_ptr(), - input_grad.get_float_ptr(), - num_out_blks.unwrap_nonnegative(), - reverse_dim_size.unwrap_nonnegative(), - in_blk_size.unwrap_nonnegative(), - input_grad.shape.get_volume().unwrap_nonnegative()); 
-} - -TaskImplFunction get_reverse_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_reverse_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_reverse_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} - -OpTaskSignature get_reverse_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_reverse_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(ReverseAttrs const &) { - return {task_id_t::REVERSE_FWD_TASK_ID, task_id_t::REVERSE_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/softmax.cc b/lib/local-execution/src/local-execution/ops/softmax.cc deleted file mode 100644 index 02cebfc4a4..0000000000 --- a/lib/local-execution/src/local-execution/ops/softmax.cc +++ /dev/null @@ -1,153 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/softmax.h" -#include "kernels/softmax_kernels.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { -using namespace FlexFlow::Kernels::Softmax; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE }; - -OpTaskInvocation init(SoftmaxAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(HANDLE, ff_handle()); - binding.bind_arg(ATTRS, attrs); - return {task_id_t::SOFTMAX_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(SoftmaxAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::SOFTMAX_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(SoftmaxAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::SOFTMAX_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - PerDeviceFFHandle handle = acc.get_argument(HANDLE); - - auto output = acc.get_tensor(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); - - nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); - nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); - nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); - nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); - - SoftmaxPerDeviceState per_device_state = - init_kernel(handle, - attrs.dim.value.unwrap_nonnegative(), - output_n.unwrap_nonnegative(), - output_c.unwrap_nonnegative(), - output_h.unwrap_nonnegative(), - output_w.unwrap_nonnegative()); - - return DeviceSpecificDeviceStates{ - 
DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - - return profile(forward_kernel, - profiling, - "[SoftMax] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr()); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto input = acc.get_tensor(INPUT); - assert(input_grad.shape == input.shape); - - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto output = acc.get_tensor(OUTPUT); - - assert(output_grad.shape == output.shape); - - return profile(backward_kernel, - profiling, - "[SoftMax] backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), - output_grad.get_float_ptr(), - output_grad.shape.get_volume().unwrap_nonnegative()); -} - -TaskImplFunction get_softmax_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_softmax_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_softmax_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_softmax_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_unchecked_arg_slot(HANDLE); - init.add_arg_slot(ATTRS); - init.add_return_value(); - return init; -} -OpTaskSignature get_softmax_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_softmax_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_softmax_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(SoftmaxAttrs const &) { - return {task_id_t::SOFTMAX_INIT_TASK_ID, - task_id_t::SOFTMAX_FWD_TASK_ID, - task_id_t::SOFTMAX_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/split.cc b/lib/local-execution/src/local-execution/ops/split.cc deleted file mode 100644 index 5661fa7381..0000000000 --- a/lib/local-execution/src/local-execution/ops/split.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "local-execution/ops/split.h" -#include "kernels/array_shape.h" -#include "kernels/split_kernels.h" -#include "utils/exception.h" -#include "utils/hash-utils.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Split; -using coord_t = long long; - -enum Slots { INPUT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(SplitAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::SPLIT_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(SplitAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::SPLIT_BWD_TASK_ID, binding}; -} - -static std::pair - calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { - nonnegative_int num_blocks = 1_n; - nonnegative_int block_size = 1_n; - for (nonnegative_int d : nonnegative_range(array_shape.num_elements())) { - if (d <= axis.value) { - block_size *= array_shape.at(legion_dim_t{d}); - } else { - num_blocks *= array_shape.at(legion_dim_t{d}); - } - } - return {num_blocks, block_size}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - coord_t out_block_sizes[MAX_NUM_OUTPUTS]; - auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis); - - for (int i = 0; i < attrs.splits.size(); i++) { - auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); - } - float *output_float_ptr = output.get_float_ptr(); - return profile(forward_kernel, - profiling, - "Split forward_time = {:.2lf}ms\n", - &output_float_ptr, - input.get_float_ptr(), - out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), - attrs.splits.size()); -} - -// maybe we should add assert like the original code -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - auto attrs = acc.get_argument(ATTRS); - - coord_t out_block_sizes[MAX_NUM_OUTPUTS]; - auto [num_blocks, in_block_size] = - calc_block_size(input_grad.shape, attrs.axis); - - for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blocks; - auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis); - out_block_sizes[i] = out_block_size.unwrap_nonnegative(); - } - float const *output_grad_ptr = output_grad.get_float_ptr(); - return profile(backward_kernel, - profiling, - "Split backward_time = {:.2lf}ms\n", - input_grad.get_float_ptr(), - &output_grad_ptr, - out_block_sizes, - in_block_size.unwrap_nonnegative(), - num_blocks.unwrap_nonnegative(), - attrs.splits.size()); -} - -TaskImplFunction get_split_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_split_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_split_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - 
fwd.add_output_slot(OUTPUT); - return fwd; -} -OpTaskSignature get_split_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_split_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(SplitAttrs const &) { - return {task_id_t::SPLIT_FWD_TASK_ID, task_id_t::SPLIT_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/topk.cc b/lib/local-execution/src/local-execution/ops/topk.cc deleted file mode 100644 index fd895605a1..0000000000 --- a/lib/local-execution/src/local-execution/ops/topk.cc +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "local-execution/ops/topk.h" -#include "kernels/topk_kernels.h" -#include "utils/exception.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::TopK; - -// For an input tensor, computes the top k entries in each row -// (resp. vector along the last dimension). Thus, -// values.shape = indices.shape = input.shape[:-1] + [k] - -enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE }; - -OpTaskInvocation init(TopKAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(ATTRS, attrs); - - return {task_id_t::TOPK_INIT_TASK_ID, binding}; -} - -OpTaskInvocation forward(TopKAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(ATTRS, attrs); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDICES, output_tensor(1)); - - return {task_id_t::TOPK_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(TopKAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::TOPK_BWD_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - - auto attrs = acc.get_argument(ATTRS); - - TopKPerDeviceState per_device_state = init_kernel(attrs.sorted); - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - nonnegative_int length = input.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input.shape.get_volume() / length; - auto indices = acc.get_tensor(INDICES); - - return profile(forward_kernel, - profiling, - "[TopK] forward_time = {:.2lf}ms\n", - per_device_state, - input.get_float_ptr(), - output.get_float_ptr(), - indices.get_int32_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative(), - attrs.sorted); -} - -static std::optional - 
backward_task_impl(TaskArgumentAccessor const &acc) { - auto attrs = acc.get_argument(ATTRS); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); - auto profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - auto indices = acc.get_tensor(INDICES); - - nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n}); - nonnegative_int batch_size = input_grad.shape.get_volume() / length; - - return profile(backward_kernel, - profiling, - "[TopK] backward_time = {:.2lf}ms\n", - per_device_state, - output_grad.get_float_ptr(), - indices.get_int32_ptr(), - input_grad.get_float_ptr(), - batch_size.unwrap_nonnegative(), - length.unwrap_nonnegative(), - attrs.k.unwrap_nonnegative()); -} - -TaskImplFunction get_topk_init_task_impl() { - return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}}; -} -TaskImplFunction get_topk_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_topk_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_topk_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_return_value(); - - return init; -} -OpTaskSignature get_topk_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - fwd.add_output_slot(INDICES); - return fwd; -} -OpTaskSignature get_topk_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_topk_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(TopKAttrs const &) { - return {task_id_t::TOPK_INIT_TASK_ID, - task_id_t::TOPK_FWD_TASK_ID, - task_id_t::TOPK_BWD_TASK_ID}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/transpose.cc b/lib/local-execution/src/local-execution/ops/transpose.cc deleted file mode 100644 index 62db7b5266..0000000000 --- a/lib/local-execution/src/local-execution/ops/transpose.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "local-execution/ops/transpose.h" -#include "kernels/transpose_kernels.h" -#include "op-attrs/ops/transpose.h" -#include "utils/integer_conversions.h" - -using namespace FlexFlow::Kernels::Transpose; - -namespace FlexFlow { - -enum Slots { - INPUT, // tensor - OUTPUT, // tensor - ATTRS, - PROFILING, -}; - -OpTaskInvocation forward(TransposeAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::TRANSPOSE_FWD_TASK_ID, binding}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Transpose] Forward_time = {:.2lf} [ms]", - attrs, - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - auto attrs = acc.get_argument(ATTRS); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Transpose] Backward_time = {:.2lf} [ms]", - attrs, - input_grad, - output_grad); -} - -OpTaskInvocation backward(TransposeAttrs const &attrs) { - OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding}; -} - -TaskImplFunction get_transpose_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} - -TaskImplFunction get_transpose_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -OpTaskSignature get_transpose_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - return fwd; -} - -OpTaskSignature get_transpose_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_transpose_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(TransposeAttrs const &) { - return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/ops/weight.cc b/lib/local-execution/src/local-execution/ops/weight.cc deleted file mode 100644 index f96c104f33..0000000000 --- a/lib/local-execution/src/local-execution/ops/weight.cc +++ /dev/null @@ -1,9 +0,0 @@ -#include "local-execution/ops/weight.h" - -namespace FlexFlow { - -std::vector get_task_ids(WeightAttrs const &attrs) { - return {}; -} - -}; // namespace FlexFlow diff --git a/lib/local-execution/src/local_cpu_allocator.cc b/lib/local-execution/src/local_cpu_allocator.cc deleted file mode 100644 index 4ca5f987a8..0000000000 --- a/lib/local-execution/src/local_cpu_allocator.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include "local-execution/local_cpu_allocator.h" -#include "utils/containers/contains_key.h" - -namespace FlexFlow { -void *LocalCPUAllocator::allocate(size_t requested_memory_size) { - void *ptr = malloc(requested_memory_size); - this->ptrs.insert({ptr, std::unique_ptr(ptr, free)}); - return ptr; -} - -void LocalCPUAllocator::deallocate(void *ptr) { - if (contains_key(this->ptrs, ptr)) { - this->ptrs.erase(ptr); - } else { - throw std::runtime_error( - "Deallocating a pointer that was not allocated by this 
Allocator"); - } -} - -Allocator create_local_cpu_memory_allocator() { - return Allocator::create(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index e53e3abeff..2e82378fdb 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc index d508c34210..3b1bb0fd2d 100644 --- a/lib/local-execution/src/local_training_backing.cc +++ b/lib/local-execution/src/local_training_backing.cc @@ -1,12 +1,12 @@ #include "local-execution/local_training_backing.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include "local-execution/task_signature_impl.h" #include "local-execution/unallocated_tensors.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/local-execution/src/loss_functions.cc b/lib/local-execution/src/loss_functions.cc index 15ebdd5f28..4d0b32fd48 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/local-execution/src/loss_functions.cc @@ -16,6 +16,7 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" #include "local-execution/loss_functions.h" +#include "kernels/format_accessor_contents.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" @@ -55,44 +56,41 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { auto logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = - logit.shape.at(legion_dim_t{nonnegative_int{1}}).unwrap_nonnegative(); + + int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape 
is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - assert(logit.shape.get_volume() == label.shape.get_volume()); - scale_factor = 2.0f / logit.shape.get_volume().unwrap_nonnegative(); + ASSERT(logit.shape.num_elements() == label.shape.num_elements()); + scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); - assert(logit_grad.shape == logit.shape); + int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() / + .int_from_positive_int() / label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative(); // TODO FIXME something seems wrong here, - // isn't the numerator guaranteed to be 1? - // <--- this is not the case because of the - // potential parallel dim + .int_from_positive_int(); // TODO FIXME something seems wrong + // here, isn't the numerator guaranteed + // to be 1? + // <--- this is not the case because of + // the potential parallel dim } - assert( - label.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(nonnegative_int{1}), std::nullopt)); - assert(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative() == + ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == + logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); + ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + .int_from_positive_int() == logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) - .unwrap_nonnegative()); - assert( - label.shape.at(legion_dim_t(nonnegative_int{0})).unwrap_nonnegative() == - 1); + .int_from_positive_int()); + ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, @@ -100,28 +98,34 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), batch_size, num_classes, k, scale_factor); } else { - assert(logit.shape == label.shape); - assert(logit_grad.shape == logit.shape); + ASSERT(logit.shape == label.shape); + ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{nonnegative_int{0}}).unwrap_nonnegative(); + logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { + size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int(); + size_t logit_grad_volume = + get_num_elements(logit_grad.shape).int_from_positive_int(); + profile(categorical_crossentropy_loss_backward_kernel, profiling, "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - 
get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + logit_volume, + logit_grad_volume, scale_factor); + + break; } case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: { @@ -131,8 +135,8 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { get_float_ptr(logit_grad), get_float_ptr(logit), get_float_ptr(label), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } @@ -142,13 +146,13 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { "[IdentityLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), - get_volume(logit.shape).unwrap_nonnegative(), - get_volume(logit_grad.shape).unwrap_nonnegative(), + get_num_elements(logit.shape).int_from_positive_int(), + get_num_elements(logit_grad.shape).int_from_positive_int(), scale_factor); break; } default: - throw mk_runtime_error(fmt::format( + PANIC(fmt::format( "Unsupported loss function {}. Please report this as an issue.", loss_type)); } diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc index da1efa6b85..f5ce639087 100644 --- a/lib/local-execution/src/loss_tensor_source.cc +++ b/lib/local-execution/src/loss_tensor_source.cc @@ -2,7 +2,7 @@ namespace FlexFlow { -size_t LossTensorSource::next_available_loss_tensor_id = 0; +nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; LossTensorSource::LossTensorSource() {} diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc index d214d0d426..d3c1c65a68 100644 --- a/lib/local-execution/src/model_training_instance.cc +++ b/lib/local-execution/src/model_training_instance.cc @@ -1,4 +1,5 @@ #include "local-execution/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "utils/containers/reversed.h" @@ -34,6 +35,13 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->label_tensor, this->allocator); + gradient_tensor_t loss_tensor = + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + PerLayerElapsedTime per_layer_elapsed_time; for (layer_guid_t const &node : reversed( topological_ordering(this->training_backing.computation_graph))) { @@ -54,14 +62,19 @@ void ModelTrainingInstance::update() { get_optimizer_attrs_for_next_iter(this->optimizer_attrs); } -void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.local_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - write_to_host_float_ptr(loss_tensor_backing, host_ptr); + + return 
read_only_accessor_from_write_accessor(loss_tensor_backing);
 }
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/src/optimizer.cc b/lib/local-execution/src/optimizer.cc
index 1b8fc37b2d..1d65172e67 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/local-execution/src/optimizer.cc
@@ -66,18 +66,18 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
   auto weight = acc.get_tensor(WEIGHT);
   auto profiling = acc.get_argument(PROFILING);
 
-  assert(weight.shape == weight_grad.shape);
-  int size = weight_grad.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight.shape == weight_grad.shape);
+  int size = weight_grad.shape.num_elements().int_from_positive_int();
 
-  assert(weight_grad.shape.get_volume().unwrap_nonnegative() &
-         weight.shape.get_volume().unwrap_nonnegative());
-  int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
-                     weight.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight_grad.shape.num_elements().int_from_positive_int() %
+         weight.shape.num_elements().int_from_positive_int() == 0);
+  int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() /
+                     weight.shape.num_elements().int_from_positive_int();
 
   float *sgd_v_ptr;
   if (attrs.momentum > 0.0f) {
     auto sgd_v = acc.get_optimizer_tensor(SGD_V);
-    assert(sgd_v.shape == weight.shape);
+    ASSERT(sgd_v.shape == weight.shape);
     sgd_v_ptr = sgd_v.get_float_ptr();
   }
 
@@ -180,14 +180,10 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
 
   auto profiling = acc.get_argument(PROFILING);
 
-  assert(weight.shape == weight_grad.shape);
-  int size = weight_grad.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight.shape == weight_grad.shape);
+  int size = weight_grad.shape.num_elements().int_from_positive_int();
 
-  assert(weight_grad.shape.get_volume().unwrap_nonnegative() %
-         weight.shape.get_volume().unwrap_nonnegative() ==
-         0);
-  int num_replicas = weight_grad.shape.get_volume().unwrap_nonnegative() /
-                     weight.shape.get_volume().unwrap_nonnegative();
+  ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0);
 
   auto handle = acc.get_argument(HANDLE);
   profile(adam_nccl_update_task_gpu,
@@ -198,9 +194,9 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) {
           attrs.beta2,
           attrs.weight_decay,
           attrs.epsilon,
-          size,
           handle,
           weight_grad.get_float_ptr(),
+          size,
           m_tensor.get_float_ptr(),
           v_tensor.get_float_ptr(),
           weight.get_float_ptr()); // how to deal with removal of ParamSync?
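A note on the two optimizer hunks above: sgd_update_task_impl derives size from the gradient tensor and num_replicas from the gradient-to-weight volume ratio, then hands everything to the GPU kernels. The CPU sketch below shows the update this sets up. It is a sketch under assumptions: the replica layout (replica r of element i at weight_grad_ptr[r * weight_size + i]) and the exact momentum/weight-decay formulation are inferred, not taken from kernels/optimizer_kernels.h.

#include <cstddef>

// CPU reference for the SGD update prepared by sgd_update_task_impl.
// In the task above, size is the gradient volume, so the per-weight
// element count used here is weight_size = size / num_replicas.
void sgd_update_cpu(float lr,
                    float momentum,
                    bool nesterov,
                    float weight_decay,
                    float const *weight_grad_ptr, // weight_size * num_replicas elements
                    size_t weight_size,
                    int num_replicas,
                    float *weight_ptr,  // weight_size elements
                    float *sgd_v_ptr) { // weight_size elements; unused if momentum == 0
  for (size_t i = 0; i < weight_size; i++) {
    // Accumulate the gradient across replicas, then apply L2 weight decay.
    float g = 0.0f;
    for (int r = 0; r < num_replicas; r++) {
      g += weight_grad_ptr[r * weight_size + i];
    }
    g += weight_decay * weight_ptr[i];

    if (momentum > 0.0f) {
      sgd_v_ptr[i] = momentum * sgd_v_ptr[i] + g;
      g = nesterov ? g + momentum * sgd_v_ptr[i] : sgd_v_ptr[i];
    }
    weight_ptr[i] -= lr * g;
  }
}

Summing replicas locally here stands in for the cross-device gradient aggregation that the NCCL task variants presumably perform.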
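The same reading applies to the loss_functions.cc hunk earlier in this patch, where scale_factor folds batch averaging (1.0f / batch_size) into the backward kernels. Below is a minimal CPU sketch of the CATEGORICAL_CROSSENTROPY branch, assuming logit holds post-softmax probabilities and that the kernel computes the standard softmax cross-entropy gradient; the actual implementation lives in kernels/loss_function_kernels.h and may differ in details.

#include <cstddef>

// Hedged CPU sketch of categorical cross-entropy backward: with
// scale_factor = 1.0f / batch_size, the gradient is averaged over the
// batch dimension rather than summed.
void categorical_crossentropy_backward_cpu(float *logit_grad,
                                           float const *logit,
                                           float const *label,
                                           size_t logit_volume,
                                           float scale_factor) {
  for (size_t i = 0; i < logit_volume; i++) {
    logit_grad[i] = (logit[i] - label[i]) * scale_factor;
  }
}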
diff --git a/lib/task-spec/src/per_device_op_state.cc b/lib/local-execution/src/per_device_op_state.cc similarity index 100% rename from lib/task-spec/src/per_device_op_state.cc rename to lib/local-execution/src/per_device_op_state.cc diff --git a/lib/local-execution/src/permissions.cc b/lib/local-execution/src/permissions.cc deleted file mode 100644 index 2286215987..0000000000 --- a/lib/local-execution/src/permissions.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include "local-execution/permissions.h" -#include "utils/exception.h" - -namespace FlexFlow { - -Permissions join(Permissions lhs, Permissions rhs) { - if (lhs <= rhs) { - return rhs; - } else if (rhs <= lhs) { - return lhs; - } else { - return Permissions::RW; - } -} - -Permissions meet(Permissions lhs, Permissions rhs) { - if (lhs <= rhs) { - return lhs; - } else if (rhs <= lhs) { - return rhs; - } else { - return Permissions::NONE; - } -} - -static int as_int(Permissions p) { - switch (p) { - case Permissions::NONE: - return 0; - case Permissions::RO: - case Permissions::WO: - return 1; - case Permissions::RW: - return 2; - default: - throw mk_runtime_error( - fmt::format("Unknown permission {}", static_cast(p))); - } -} - -static bool comparable(Permissions lhs, Permissions rhs) { - return !(lhs == Permissions::RO && rhs == Permissions::WO || - lhs == Permissions::WO && rhs == Permissions::RO); -} - -bool operator<(Permissions lhs, Permissions rhs) { - if (!comparable(lhs, rhs)) { - return false; - } - int lhs_int = as_int(lhs); - int rhs_int = as_int(rhs); - return lhs_int < rhs_int; -} - -bool operator<=(Permissions lhs, Permissions rhs) { - return (lhs < rhs) || (lhs == rhs); -} - -bool operator>(Permissions lhs, Permissions rhs) { - if (!comparable(lhs, rhs)) { - return false; - } - int lhs_int = as_int(lhs); - int rhs_int = as_int(rhs); - return lhs_int > rhs_int; -} - -bool operator>=(Permissions lhs, Permissions rhs) { - return (lhs > rhs) || (lhs == rhs); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc index 2787342a5f..ae3d97daa4 100644 --- a/lib/local-execution/src/task_registry.cc +++ b/lib/local-execution/src/task_registry.cc @@ -1,6 +1,6 @@ #include "local-execution/task_registry.h" -#include "local-execution/task_signature_impl.h" #include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" namespace FlexFlow { diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc deleted file mode 100644 index 9031d2a015..0000000000 --- a/lib/local-execution/src/task_signature_impl.cc +++ /dev/null @@ -1,366 +0,0 @@ -#include "local-execution/task_signature_impl.h" -#include "local-execution/ops/attention.h" -#include "local-execution/ops/batch_matmul.h" -#include "local-execution/ops/batch_norm.h" -#include "local-execution/ops/cast.h" -#include "local-execution/ops/combine.h" -#include "local-execution/ops/concat.h" -#include "local-execution/ops/conv_2d.h" -#include "local-execution/ops/dropout.h" -#include "local-execution/ops/element_binary.h" -#include "local-execution/ops/element_unary.h" -#include "local-execution/ops/embedding.h" -#include "local-execution/ops/flat.h" -#include "local-execution/ops/gather.h" -#include "local-execution/ops/input.h" -#include "local-execution/ops/layer_norm.h" -#include "local-execution/ops/linear.h" -#include "local-execution/ops/noop.h" -#include "local-execution/ops/pool_2d.h" -#include "local-execution/ops/reduce.h" -#include 
"local-execution/ops/reduction.h" -#include "local-execution/ops/repartition.h" -#include "local-execution/ops/replicate.h" -#include "local-execution/ops/reshape.h" -#include "local-execution/ops/reverse.h" -#include "local-execution/ops/softmax.h" -#include "local-execution/ops/split.h" -#include "local-execution/ops/topk.h" -#include "local-execution/ops/transpose.h" -#include "local-execution/ops/weight.h" -#include "utils/overload.h" - -namespace FlexFlow { - -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { - switch (task_id) { - case task_id_t::ELEMENTBINARY_INIT_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_init_task_impl(), - get_element_binary_init_signature()}; - case task_id_t::ELEMENTBINARY_FWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_fwd_task_impl(), - get_element_binary_fwd_signature()}; - case task_id_t::ELEMENTBINARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_binary_bwd_task_impl(), - get_element_binary_bwd_signature()}; - case task_id_t::ELEMENTUNARY_INIT_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_init_task_impl(), - get_element_unary_init_signature()}; - case task_id_t::ELEMENTUNARY_FWD_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_fwd_task_impl(), - get_element_unary_fwd_signature()}; - case task_id_t::ELEMENTUNARY_BWD_TASK_ID: - return TaskSignatureAndImpl{get_element_unary_bwd_task_impl(), - get_element_unary_bwd_signature()}; - case task_id_t::CONV2D_INIT_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_init_task_impl(), - get_conv_2d_init_signature()}; - case task_id_t::CONV2D_FWD_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_fwd_task_impl(), - get_conv_2d_fwd_signature()}; - case task_id_t::CONV2D_BWD_TASK_ID: - return TaskSignatureAndImpl{get_conv_2d_bwd_task_impl(), - get_conv_2d_bwd_signature()}; - case task_id_t::DROPOUT_INIT_TASK_ID: - return TaskSignatureAndImpl{get_dropout_init_task_impl(), - get_dropout_init_signature()}; - case task_id_t::DROPOUT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_dropout_fwd_task_impl(), - get_dropout_fwd_signature()}; - case task_id_t::DROPOUT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_dropout_bwd_task_impl(), - get_dropout_bwd_signature()}; - // case task_id_t::EMBED_FWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), - // get_embedding_fwd_signature()}; - // case task_id_t::EMBED_BWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), - // get_embedding_bwd_signature()}; - case task_id_t::GATHER_INIT_TASK_ID: - return TaskSignatureAndImpl{get_gather_init_task_impl(), - get_gather_init_signature()}; - case task_id_t::GATHER_FWD_TASK_ID: - return TaskSignatureAndImpl{get_gather_fwd_task_impl(), - get_gather_fwd_signature()}; - case task_id_t::GATHER_BWD_TASK_ID: - return TaskSignatureAndImpl{get_gather_bwd_task_impl(), - get_gather_bwd_signature()}; - case task_id_t::CAST_FWD_TASK_ID: - return TaskSignatureAndImpl{get_cast_fwd_task_impl(), - get_cast_fwd_signature()}; - case task_id_t::CAST_BWD_TASK_ID: - return TaskSignatureAndImpl{get_cast_bwd_task_impl(), - get_cast_bwd_signature()}; - case task_id_t::POOL2D_INIT_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_init_task_impl(), - get_pool_2d_init_signature()}; - case task_id_t::POOL2D_FWD_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_fwd_task_impl(), - get_pool_2d_fwd_signature()}; - case task_id_t::POOL2D_BWD_TASK_ID: - return TaskSignatureAndImpl{get_pool_2d_bwd_task_impl(), - get_pool_2d_bwd_signature()}; - case 
task_id_t::BATCHNORM_INIT_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_init_task_impl(), - get_batch_norm_init_signature()}; - case task_id_t::BATCHNORM_FWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_fwd_task_impl(), - get_batch_norm_fwd_signature()}; - case task_id_t::BATCHNORM_BWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_norm_bwd_task_impl(), - get_batch_norm_bwd_signature()}; - case task_id_t::BATCHMATMUL_FWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_matmul_fwd_task_impl(), - get_batch_matmul_fwd_signature()}; - case task_id_t::BATCHMATMUL_BWD_TASK_ID: - return TaskSignatureAndImpl{get_batch_matmul_bwd_task_impl(), - get_batch_matmul_bwd_signature()}; - case task_id_t::LAYERNORM_INIT_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_init_task_impl(), - get_layer_norm_init_signature()}; - case task_id_t::LAYERNORM_FWD_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_fwd_task_impl(), - get_layer_norm_init_signature()}; - case task_id_t::LAYERNORM_BWD_TASK_ID: - return TaskSignatureAndImpl{get_layer_norm_bwd_task_impl(), - get_layer_norm_bwd_signature()}; - case task_id_t::LINEAR_INIT_TASK_ID: - return TaskSignatureAndImpl{get_linear_init_task_impl(), - get_linear_init_signature()}; - case task_id_t::LINEAR_FWD_TASK_ID: - return TaskSignatureAndImpl{get_linear_fwd_task_impl(), - get_linear_fwd_signature()}; - case task_id_t::LINEAR_BWD_TASK_ID: - return TaskSignatureAndImpl{get_linear_bwd_task_impl(), - get_linear_bwd_signature()}; - case task_id_t::FLAT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_flat_fwd_task_impl(), - get_flat_fwd_signature()}; - case task_id_t::FLAT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_flat_bwd_task_impl(), - get_flat_bwd_signature()}; - case task_id_t::SOFTMAX_INIT_TASK_ID: - return TaskSignatureAndImpl{get_softmax_init_task_impl(), - get_softmax_init_signature()}; - case task_id_t::SOFTMAX_FWD_TASK_ID: - return TaskSignatureAndImpl{get_softmax_fwd_task_impl(), - get_softmax_fwd_signature()}; - case task_id_t::SOFTMAX_BWD_TASK_ID: - return TaskSignatureAndImpl{get_softmax_bwd_task_impl(), - get_softmax_bwd_signature()}; - case task_id_t::CONCAT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_concat_fwd_task_impl(), - get_concat_fwd_signature()}; - case task_id_t::CONCAT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_concat_bwd_task_impl(), - get_concat_bwd_signature()}; - case task_id_t::SPLIT_FWD_TASK_ID: - return TaskSignatureAndImpl{get_split_fwd_task_impl(), - get_split_fwd_signature()}; - case task_id_t::SPLIT_BWD_TASK_ID: - return TaskSignatureAndImpl{get_split_bwd_task_impl(), - get_split_bwd_signature()}; - case task_id_t::REDUCE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reduce_init_task_impl(), - get_reduce_init_signature()}; - case task_id_t::REDUCE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduce_fwd_task_impl(), - get_reduce_fwd_signature()}; - case task_id_t::REDUCE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduce_bwd_task_impl(), - get_reduce_bwd_signature()}; - case task_id_t::RESHAPE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reshape_init_task_impl(), - get_reshape_init_signature()}; - case task_id_t::RESHAPE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reshape_fwd_task_impl(), - get_reshape_fwd_signature()}; - case task_id_t::RESHAPE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reshape_bwd_task_impl(), - get_reshape_bwd_signature()}; - case task_id_t::REVERSE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reverse_fwd_task_impl(), - get_reverse_fwd_signature()}; - case 
task_id_t::REVERSE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reverse_bwd_task_impl(), - get_reverse_bwd_signature()}; - case task_id_t::TOPK_INIT_TASK_ID: - return TaskSignatureAndImpl{get_topk_init_task_impl(), - get_topk_init_signature()}; - case task_id_t::TOPK_FWD_TASK_ID: - return TaskSignatureAndImpl{get_topk_fwd_task_impl(), - get_topk_fwd_signature()}; - case task_id_t::TOPK_BWD_TASK_ID: - return TaskSignatureAndImpl{get_topk_bwd_task_impl(), - get_topk_bwd_signature()}; - case task_id_t::TRANSPOSE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_transpose_fwd_task_impl(), - get_transpose_fwd_signature()}; - case task_id_t::TRANSPOSE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_transpose_bwd_task_impl(), - get_transpose_bwd_signature()}; - case task_id_t::ATTENTION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_attention_init_task_impl(), - get_attention_init_signature()}; - case task_id_t::ATTENTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_attention_fwd_task_impl(), - get_attention_fwd_signature()}; - case task_id_t::ATTENTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_attention_bwd_task_impl(), - get_attention_bwd_signature()}; - case task_id_t::COMBINE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_fwd_task_impl(), - get_combine_fwd_signature()}; - case task_id_t::COMBINE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_bwd_task_impl(), - get_combine_bwd_signature()}; - case task_id_t::REDUCTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_fwd_task_impl(), - get_reduction_fwd_signature()}; - case task_id_t::REDUCTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_bwd_task_impl(), - get_reduction_bwd_signature()}; - case task_id_t::REPARTITION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_repartition_init_task_impl(), - get_repartition_init_signature()}; - case task_id_t::REPARTITION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_fwd_task_impl(), - get_repartition_fwd_signature()}; - case task_id_t::REPARTITION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_bwd_task_impl(), - get_repartition_bwd_signature()}; - case task_id_t::REPLICATE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_fwd_task_impl(), - get_replicate_fwd_signature()}; - case task_id_t::REPLICATE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_bwd_task_impl(), - get_replicate_bwd_signature()}; - default: - throw mk_runtime_error( - fmt::format("Invalid task ID")); // inserting task_id yields - // "type_is_unformattable" error - } -} - -std::vector get_task_ids(ComputationGraphOpAttrs const &op) { - return op.visit>(overload{ - [](BatchMatmulAttrs const &attrs) { return get_task_ids(attrs); }, - [](BatchNormAttrs const &attrs) { return get_task_ids(attrs); }, - [](CastAttrs const &attrs) { return get_task_ids(attrs); }, - [](ConcatAttrs const &attrs) { return get_task_ids(attrs); }, - [](Conv2DAttrs const &attrs) { return get_task_ids(attrs); }, - [](DropoutAttrs const &attrs) { return get_task_ids(attrs); }, - [](ElementBinaryAttrs const &attrs) { return get_task_ids(attrs); }, - [](ElementUnaryAttrs const &attrs) { return get_task_ids(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return get_task_ids(attrs); - // }, - [](FlatAttrs const &attrs) { return get_task_ids(attrs); }, - [](GatherAttrs const &attrs) { return get_task_ids(attrs); }, - [](InputAttrs const &attrs) { return get_task_ids(attrs); }, - [](LayerNormAttrs const &attrs) { return get_task_ids(attrs); }, - [](LinearAttrs const &attrs) { return 
get_task_ids(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return get_task_ids(attrs); }, - [](NoopAttrs const &attrs) { return get_task_ids(attrs); }, - [](Pool2DAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReduceAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReverseAttrs const &attrs) { return get_task_ids(attrs); }, - [](ReshapeAttrs const &attrs) { return get_task_ids(attrs); }, - [](SplitAttrs const &attrs) { return get_task_ids(attrs); }, - [](SoftmaxAttrs const &attrs) { return get_task_ids(attrs); }, - [](TopKAttrs const &attrs) { return get_task_ids(attrs); }, - [](TransposeAttrs const &attrs) { return get_task_ids(attrs); }, - [](WeightAttrs const &attrs) { return get_task_ids(attrs); }, - [](auto const &attrs) -> std::vector { - throw mk_runtime_error(fmt::format("Unhandled attr type: {}", attrs)); - }, - }); -} - -OpTaskInvocation init(ComputationGraphOpAttrs const &op) { - return op.visit(overload{ - [](BatchNormAttrs const &attrs) { return init(attrs); }, - [](Conv2DAttrs const &attrs) { return init(attrs); }, - [](DropoutAttrs const &attrs) { return init(attrs); }, - [](ElementBinaryAttrs const &attrs) { return init(attrs); }, - [](ElementUnaryAttrs const &attrs) { return init(attrs); }, - [](GatherAttrs const &attrs) { return init(attrs); }, - [](LayerNormAttrs const &attrs) { return init(attrs); }, - [](LinearAttrs const &attrs) { return init(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return init(attrs); }, - [](Pool2DAttrs const &attrs) { return init(attrs); }, - [](ReduceAttrs const &attrs) { return init(attrs); }, - [](ReshapeAttrs const &attrs) { return init(attrs); }, - [](SoftmaxAttrs const &attrs) { return init(attrs); }, - [](TopKAttrs const &attrs) { return init(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { - return op.visit(overload{ - [](BatchMatmulAttrs const &attrs) { return forward(attrs); }, - [](BatchNormAttrs const &attrs) { return forward(attrs); }, - [](CastAttrs const &attrs) { return forward(attrs); }, - [](ConcatAttrs const &attrs) { return forward(attrs); }, - [](Conv2DAttrs const &attrs) { return forward(attrs); }, - [](DropoutAttrs const &attrs) { return forward(attrs); }, - [](ElementBinaryAttrs const &attrs) { return forward(attrs); }, - [](ElementUnaryAttrs const &attrs) { return forward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return forward(attrs); - // }, - [](FlatAttrs const &attrs) { return forward(attrs); }, - [](GatherAttrs const &attrs) { return forward(attrs); }, - [](LayerNormAttrs const &attrs) { return forward(attrs); }, - [](LinearAttrs const &attrs) { return forward(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return forward(attrs); }, - [](Pool2DAttrs const &attrs) { return forward(attrs); }, - [](ReduceAttrs const &attrs) { return forward(attrs); }, - [](ReverseAttrs const &attrs) { return forward(attrs); }, - [](ReshapeAttrs const &attrs) { return forward(attrs); }, - [](SplitAttrs const &attrs) { return forward(attrs); }, - [](SoftmaxAttrs const &attrs) { return forward(attrs); }, - [](TopKAttrs const &attrs) { return forward(attrs); }, - [](TransposeAttrs const &attrs) { return forward(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { - 
return op.visit(overload{ - [](BatchMatmulAttrs const &attrs) { return backward(attrs); }, - [](BatchNormAttrs const &attrs) { return backward(attrs); }, - [](CastAttrs const &attrs) { return backward(attrs); }, - [](ConcatAttrs const &attrs) { return backward(attrs); }, - [](Conv2DAttrs const &attrs) { return backward(attrs); }, - [](DropoutAttrs const &attrs) { return backward(attrs); }, - [](ElementBinaryAttrs const &attrs) { return backward(attrs); }, - [](ElementUnaryAttrs const &attrs) { return backward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return backward(attrs); - // }, - [](FlatAttrs const &attrs) { return backward(attrs); }, - [](GatherAttrs const &attrs) { return backward(attrs); }, - [](LayerNormAttrs const &attrs) { return backward(attrs); }, - [](LinearAttrs const &attrs) { return backward(attrs); }, - [](MultiHeadAttentionAttrs const &attrs) { return backward(attrs); }, - [](Pool2DAttrs const &attrs) { return backward(attrs); }, - [](ReduceAttrs const &attrs) { return backward(attrs); }, - [](ReverseAttrs const &attrs) { return backward(attrs); }, - [](ReshapeAttrs const &attrs) { return backward(attrs); }, - [](SplitAttrs const &attrs) { return backward(attrs); }, - [](SoftmaxAttrs const &attrs) { return backward(attrs); }, - [](TopKAttrs const &attrs) { return backward(attrs); }, - [](TransposeAttrs const &attrs) { return backward(attrs); }, - [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); - }, - }); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index e6c3a11711..ed181aea32 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -23,8 +23,13 @@ size_t TrackedAllocator::get_current_mem_usage() { return this->current_mem_usage; } +DeviceType TrackedAllocator::get_allocation_device_type() const { + return this->allocator.get_allocation_device_type(); +} + Allocator get_tracked_memory_allocator(Allocator const &base_allocator) { - return Allocator::create(base_allocator); + Allocator allocator = Allocator::create(base_allocator); + return allocator; } } // namespace FlexFlow diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc index 363d1eedef..b8daa90e3b 100644 --- a/lib/local-execution/src/unallocated_tensors.cc +++ b/lib/local-execution/src/unallocated_tensors.cc @@ -70,7 +70,6 @@ UnallocatedTensors generate_unallocated_tensors_with_optimizer( num_optimizer_tensors_to_allocate -= allocated_tensors.optimizer_mapping.at(tensor_guid).size(); } - std::cout << num_optimizer_tensors_to_allocate; for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { optimizer_tensor_t optimizer_tensor = diff --git a/lib/local-execution/test/CMakeLists.txt b/lib/local-execution/test/CMakeLists.txt index a973c6967b..0e79376575 100644 --- a/lib/local-execution/test/CMakeLists.txt +++ b/lib/local-execution/test/CMakeLists.txt @@ -11,11 +11,6 @@ ff_add_test_executable( local-execution kernels op-attrs + task-spec ) -set(FF_TEST_EXEC_NAME "local-execution-tests") -add_custom_command( - TARGET ${FF_TEST_EXEC_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -DFF_TEST_EXEC_NAME=${FF_TEST_EXEC_NAME} -P ${CMAKE_CURRENT_LIST_DIR}/modify_test_commands.cmake - DEPENDS ${FF_TEST_EXEC_NAME} -) diff --git a/lib/local-execution/test/modify_test_commands.cmake b/lib/local-execution/test/modify_test_commands.cmake deleted file mode 
100644 index 6494ae2d78..0000000000 --- a/lib/local-execution/test/modify_test_commands.cmake +++ /dev/null @@ -1,21 +0,0 @@ -# modify_test_commands.cmake - -file(GLOB ctest_tests_files "${CMAKE_CURRENT_BINARY_DIR}/${FF_TEST_EXEC_NAME}_tests-*.cmake") - -foreach(ctest_tests_file IN LISTS ctest_tests_files) - file(READ "${ctest_tests_file}" content) - - # add nix run prefix - string(REGEX REPLACE - "add_test\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+([^ ]+)[ \t\r\n]+\\[==\\[([^]]+)\\]==\\]\\)" - "add_test( [==[\\1]==] nixGL -- \\2 [==[\\3]==])" - content "${content}") - - # add environment - # string(REGEX REPLACE - # "set_tests_properties\\([ \t\r\n]*\\[==\\[([^]]+)\\]==\\][ \t\r\n]+PROPERTIES[ \t\r\n]+([^)]+)\\)" - # "set_tests_properties( [==[\\1]==] PROPERTIES \\2 ENVIRONMENT \"NIXPKGS_ALLOW_UNFREE=1\")" - # content "${content}") - - file(WRITE "${ctest_tests_file}" "${content}") -endforeach() diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc index 45fc8e0a1c..3242ca79ad 100644 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ b/lib/local-execution/test/src/test_allocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" #include "local-execution/optimizer_tensor_source.h" #include "pcg/computation_graph.dtg.h" @@ -29,16 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index ccad60a900..de759e2e01 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,6 +1,7 @@ -#include "kernels/local_cuda_allocator.h" -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" @@ -14,20 +15,21 @@ using namespace ::FlexFlow; -bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, 
cpu_allocator)); } -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("E2ETest") { +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalBackend e2e Training") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); @@ -35,36 +37,38 @@ TEST_SUITE(FF_TEST_SUITE) { LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocator.allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocator); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocator); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocator); LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -73,6 +77,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -81,13 +86,14 @@ TEST_SUITE(FF_TEST_SUITE) { std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -98,7 +104,7 @@ TEST_SUITE(FF_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -123,6 +129,17 @@ 
TEST_SUITE(FF_TEST_SUITE) { GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + LocalTrainingBacking local_training_backing = LocalTrainingBacking{allocator, allocated_tensors, @@ -141,28 +158,25 @@ TEST_SUITE(FF_TEST_SUITE) { loss_attrs, optimizer_attrs}; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + int num_epochs = 5; - int num_samples = batch_size.unwrap_nonnegative(); - std::vector loss_values(num_epochs); + std::vector loss_values; for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - float *host_loss_ptr = new float[num_samples]; - model_training_instance.write_loss_tensor_to_host(host_loss_ptr); - loss_values[i] = host_loss_ptr; + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - float *first_epoch = loss_values[0]; - float *last_epoch = loss_values[num_epochs - 1]; - CHECK(did_loss_decrease( - first_epoch, last_epoch, batch_size.unwrap_nonnegative())); + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + + GenericTensorAccessorR last_epoch = loss_values.back(); - for (int i = 0; i < num_epochs; i++) { - delete[] loss_values[i]; - } + CHECK(did_loss_decrease(first_epoch_loss, last_epoch)); } -} \ No newline at end of file +} diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc index c9c5afe04e..42b88aa6bc 100644 --- a/lib/local-execution/test/src/test_local_cost_estimator.cc +++ b/lib/local-execution/test/src/test_local_cost_estimator.cc @@ -9,10 +9,11 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LocalCostEstimator") { - // local backing initialization - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ DeviceSpecific::create(managed_handle.raw_handle()), @@ -23,8 +24,8 @@ TEST_SUITE(FF_TEST_SUITE) { LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config}; SUBCASE("Estimate cost -- Attention Op") { - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, /*num_heads=*/num_heads, @@ -36,14 +37,14 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + 
FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }); @@ -66,7 +67,7 @@ TEST_SUITE(FF_TEST_SUITE) { make_1d_machine_view( MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, MachineSpecificationDimension::INTRA_NODE, - stride_t{0_n})); + stride_t{1_p})); CHECK(result.total_elapsed_time > 0); CHECK(result.total_mem_usage > 0); diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f7e9da08ed..5c11010e2a 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -1,7 +1,7 @@ #include "doctest/doctest.h" -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_task_argument_accessor.h" -#include "local-execution/task_signature_impl.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" using namespace ::FlexFlow; @@ -9,17 +9,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; - nonnegative_int batch_size = 40_n; - nonnegative_int seq_len = 48_n; - nonnegative_int feature_size = 36_n; + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{ - FFOrdered{batch_size, seq_len, feature_size}}, + TensorDims{FFOrdered{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc index 594051c2f1..bba0bd28ce 100644 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ b/lib/local-execution/test/src/test_local_tensor_backing.cc @@ -1,4 +1,4 @@ -#include "local-execution/local_cpu_allocator.h" +#include "kernels/local_cpu_allocator.h" #include "local-execution/local_tensor_backing.h" #include "test_utils.h" #include "utils/containers/keys.h" @@ -94,12 +94,10 @@ TEST_SUITE(FF_TEST_SUITE) { tensor_guid_source.new_mock_tensor_guid(); TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW allocated_tensor_backing = diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/test_loss_functions.cc index ca2482653b..d741d4d8d4 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/test_loss_functions.cc @@ -13,11 +13,13 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("LossFunctions") { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); 
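The accessor-based loss check used across these rewritten tests composes two kernels/ helpers: compare_tensor_accessors_le performs an elementwise last <= first comparison into a tensor obtained from the given allocator, and tensor_accessor_all reduces that tensor to a single bool. A minimal self-contained sketch of the idiom, assuming the header locations and signatures implied by the includes in the tests above:

#include "kernels/accessor.h"
#include "kernels/allocation.h"
#include "kernels/compare_tensor_accessors.h"
#include "kernels/local_cpu_allocator.h"
#include "kernels/tensor_accessor_reductions.h"

using namespace ::FlexFlow;

// True iff no element of `last` exceeds the matching element of `first`,
// i.e. no sample's loss increased between the two epochs.
bool loss_did_not_increase(GenericTensorAccessorR const &first,
                           GenericTensorAccessorR const &last) {
  Allocator cpu_allocator = create_local_cpu_memory_allocator();
  // Elementwise `last <= first`, materialized via the CPU allocator.
  auto elementwise_le =
      compare_tensor_accessors_le(last, first, cpu_allocator);
  // Reduce the boolean-valued comparison tensor to a single bool.
  return tensor_accessor_all(elementwise_le);
}

Copying each epoch's loss to host first (copy_tensor_accessor_r with a CPU allocator, as the tests do) keeps the comparison itself off the GPU.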
@@ -28,16 +30,14 @@ TEST_SUITE(FF_TEST_SUITE) { loss_tensor_t label_for_sparse_cce_loss_attrs = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_n}}, - DataType::FLOAT}; + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = allocator.allocate_tensor(output_tensor_shape); @@ -55,12 +55,10 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph computation_graph = make_empty_computation_graph(); TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index 16877b0e09..4bcfa7fe17 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -1,8 +1,8 @@ #include "doctest/doctest.h" #include "kernels/local_cuda_allocator.h" #include "local-execution/local_cost_estimator.h" -#include "local-execution/task_signature_impl.h" #include "pcg/computation_graph_builder.h" +#include "task-spec/task_signature_impl.h" #include "utils/fmt/optional.h" #include "utils/fmt/unordered_map.h" @@ -12,8 +12,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("TaskRegistry") { layer_guid_t layer_guid = layer_guid_t{Node{0}}; - nonnegative_int embed_dim = 32_n; - nonnegative_int num_heads = 10_n; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -80,7 +80,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("different attrs, still same task fn mapping") { layer_guid_t layer_1 = layer_guid_t{Node{1}}; - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; layer_guid_t layer_2 = layer_guid_t{Node{2}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ @@ -112,7 +112,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { SUBCASE("different attrs is still equal") { - nonnegative_int embed_dim = 100_n; + positive_int embed_dim = 100_p; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc index 82f5a132fe..0a0b99e61c 100644 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ b/lib/local-execution/test/src/test_unallocated_tensors.cc @@ -1,6 +1,6 @@ +#include "kernels/local_cpu_allocator.h" #include "local-execution/allocated_tensors.h" #include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_cpu_allocator.h" #include "local-execution/loss_tensor_source.h" 
#include "local-execution/optimizer_tensor_source.h" #include "local-execution/unallocated_tensors.h" @@ -38,16 +38,13 @@ TEST_SUITE(FF_TEST_SUITE) { optimizer_tensor_source.new_optimizer_tensor(); TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 10_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 20_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, CreateGrad::NO}; TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_n, 30_n}}, - DataType::FLOAT}, + TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, CreateGrad::YES}; GenericTensorAccessorW tensor_backing_1 = diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/test_update.cc index 75ba517d1b..54c64e6b6c 100644 --- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/test_update.cc @@ -11,11 +11,13 @@ using namespace ::FlexFlow; -TEST_SUITE(FF_TEST_SUITE) { +TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("ExecuteUpdate") { // initialize runtime configs ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); @@ -23,17 +25,15 @@ TEST_SUITE(FF_TEST_SUITE) { // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int output_dim = 32_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int output_dim = 32_p; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape = TensorShape{ - TensorDims{FFOrdered{data_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, output_dim}}, DataType::FLOAT}; LayerAddedResult inputs_layer = add_input_layer(computation_graph, input_tensor_shape); From d5a57ba5f549a0dc420cae6d9e90f7128dd48476 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 12:05:48 -0700 Subject: [PATCH 86/91] feat: test e2e for realm-backend --- .proj.toml | 6 +- .../realm-backend/model_training_instance.h | 2 +- .../include/realm-backend/realm_allocator.h | 4 +- .../realm_task_argument_accessor.h | 2 +- .../realm_tensor_backing.struct.toml | 3 - .../src/model_training_instance.cc | 21 +++- lib/realm-backend/src/realm_allocator.cc | 4 + .../src/realm_task_argument_accessor.cc | 7 +- .../src/realm_training_backing.cc | 2 +- lib/realm-backend/test/src/test_e2e.cc | 105 ++++++++++-------- 10 files changed, 92 insertions(+), 64 deletions(-) diff --git a/.proj.toml b/.proj.toml index 56faaec75d..20a10c98da 100644 --- a/.proj.toml +++ b/.proj.toml @@ -72,8 +72,10 @@ has-cuda-benchmarks = false [targets.realm-backend] type = "lib" -tests = false -benchmarks = false +has-cpu-only-tests = true +has-cpu-only-benchmarks = false +has-cuda-tests = true +has-cuda-benchmarks = false [targets.models] type = "lib" diff --git 
a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index 049836d042..b1580b0305 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -28,7 +28,7 @@ struct ModelTrainingInstance { PerLayerElapsedTime forward(); PerLayerElapsedTime backward(); void update(); - void write_loss_tensor_to_host(float *host_ptr); + GenericTensorAccessorR get_loss_tensor_accessor() const; }; } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/realm_allocator.h b/lib/realm-backend/include/realm-backend/realm_allocator.h index 304ca38e32..2c6c854837 100644 --- a/lib/realm-backend/include/realm-backend/realm_allocator.h +++ b/lib/realm-backend/include/realm-backend/realm_allocator.h @@ -8,8 +8,6 @@ namespace FlexFlow { -struct RealmAllocatorImpl; - struct RealmAllocatorImpl : public IAllocator { RealmAllocatorImpl() = delete; RealmAllocatorImpl(RealmAllocatorImpl const &) = delete; @@ -20,6 +18,8 @@ struct RealmAllocatorImpl : public IAllocator { void *allocate(size_t) override; void deallocate(void *) override; + DeviceType get_allocation_device_type() const override; + private: std::unordered_map ptrs; Realm::Processor proc; diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h index 256e69c301..0e83a3de6f 100644 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h @@ -1,9 +1,9 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#include "local-execution/task_argument_accessor.h" #include "realm-backend/realm_allocator.h" #include "task-spec/slot_tensor_type_id.dtg.h" +#include "task-spec/task_argument_accessor.h" #include #include diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml index 92a074e4fc..d53071dd0e 100644 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml +++ b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml @@ -3,7 +3,6 @@ name = "RealmTensorBacking" features = [ "eq", "fmt", - "hash" ] includes = [ @@ -15,9 +14,7 @@ includes = [ ] src_includes = [ - "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", - "utils/hash/vector.h", "utils/fmt/vector.h", ] diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 0c318f8942..87b8121bd5 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -1,6 +1,7 @@ #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "realm-backend/model_training_instance.h" +#include "kernels/format_accessor_contents.h" #include "utils/containers/reversed.h" namespace FlexFlow { @@ -39,6 +40,13 @@ PerLayerElapsedTime ModelTrainingInstance::backward() { this->logit_tensor, this->label_tensor); + gradient_tensor_t loss_tensor = + this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); + GenericTensorAccessorW loss_tensor_backing = + this->training_backing.realm_tensor_backing.tensor_backings.at( + TensorTypeVariant{loss_tensor}); + PerLayerElapsedTime 
per_layer_elapsed_time; std::unordered_map> per_layer_elapsed_time_future; @@ -73,14 +81,19 @@ void ModelTrainingInstance::update() { this->optimizer_attrs); } -void ModelTrainingInstance::write_loss_tensor_to_host(float *host_ptr) { +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + GenericTensorAccessorW logit_tensor_backing = this->training_backing + .realm_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); + + gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing - .tensor_gradient_mapping.at(this->logit_tensor); + this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( + this->logit_tensor); GenericTensorAccessorW loss_tensor_backing = this->training_backing.realm_tensor_backing.tensor_backings.at( TensorTypeVariant{loss_tensor}); - write_to_host_float_ptr(loss_tensor_backing, host_ptr); + + return read_only_accessor_from_write_accessor(loss_tensor_backing); } } // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index d7139210bc..d8c60f375f 100644 --- a/lib/realm-backend/src/realm_allocator.cc +++ b/lib/realm-backend/src/realm_allocator.cc @@ -36,6 +36,10 @@ void RealmAllocatorImpl::deallocate(void *ptr) { } } +DeviceType RealmAllocatorImpl::get_allocation_device_type() const { + return DeviceType::GPU; +} + Allocator create_realm_memory_allocator(Processor proc) { return Allocator::create(proc); } diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc index c7e81da01d..b7f10772e0 100644 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ b/lib/realm-backend/src/realm_task_argument_accessor.cc @@ -24,8 +24,8 @@ GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( auto tensor_backing = std::get( this->tensor_slots_backing.at(slot_tensor_type)); if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = { - tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}; + GenericTensorAccessorR readonly_tensor_backing = + read_only_accessor_from_write_accessor(tensor_backing); return readonly_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; @@ -33,6 +33,7 @@ GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); } } + VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; @@ -43,7 +44,7 @@ VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( for (GenericTensorAccessorW const &tensor_backing : variadic_tensor_backing) { readonly_variadic_tensor_backing.push_back( - {tensor_backing.data_type, tensor_backing.shape, tensor_backing.ptr}); + read_only_accessor_from_write_accessor(tensor_backing)); } return readonly_variadic_tensor_backing; } else if (priv == Permissions::RW || priv == Permissions::WO) { diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc index 4e36bf8d5c..053bf62838 100644 --- a/lib/realm-backend/src/realm_training_backing.cc +++ b/lib/realm-backend/src/realm_training_backing.cc @@ -1,7 +1,6 @@ #include "kernels/allocation.h" #include "local-execution/loss_functions.h" #include "local-execution/optimizer.h" -#include 
"local-execution/task_signature_impl.h" #include "pcg/computation_graph.dtg.h" #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" @@ -9,6 +8,7 @@ #include "task-spec/op_task_to_task_invocation.h" #include "task-spec/runtime_arg_config.h" #include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" #include "utils/containers/contains.h" #include "utils/containers/contains_key.h" #include "utils/containers/get_only.h" diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index ba180494c3..ea8ca883bd 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -1,5 +1,7 @@ -#include "kernels/managed_ff_stream.h" -#include "kernels/managed_per_device_ff_handle.h" +#include "kernels/compare_tensor_accessors.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/tensor_accessor_reductions.h" +#include "kernels/test_utils.h" #include "local-execution/allocated_tensors.h" #include "realm-backend/realm_allocator.h" #include "realm-backend/realm_training_backing.h" @@ -14,20 +16,21 @@ using namespace ::FlexFlow; using namespace Realm; -bool did_loss_decrease(float *first_epoch, float *last_epoch, int batch_size) { - for (int i = 0; i < batch_size; i++) { - if (first_epoch[i] < last_epoch[i]) { - return false; - } - } - return true; +bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, + GenericTensorAccessorR const &last_epoch) { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + return tensor_accessor_all( + compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } void top_level_task(const void *args, size_t arglen, const void *userdata, size_t userlen, Realm::Processor p) { // initialize runtime ManagedFFStream managed_stream{}; - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(); + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); std::vector worker_procs; std::vector allocators; Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine()) @@ -42,36 +45,37 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LossTensorSource loss_tensor_source; loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - nonnegative_int batch_size = 10_n; - nonnegative_int data_dim = 16_n; - nonnegative_int hidden_dim = 32_n; - nonnegative_int output_dim = 1_n; + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = - allocators[0].allocate_tensor(output_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_tensor}, label_tensor_backing}}, {}, {}}; + GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( + output_tensor_shape, allocators[0]); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, - DataType::FLOAT}; - TensorShape weight_shape_1 = 
TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, - DataType::FLOAT}; + TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( + weight_shape_1, allocators[0]); + GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( + weight_shape_2, allocators[0]); LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); + tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); + GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( + input_tensor_shape, allocators[0]); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -80,6 +84,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, std::nullopt}, {}, {}); + tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -88,13 +93,14 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, std::nullopt}, {}, {}); + tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -105,7 +111,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - Activation::RELU, + std::nullopt, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, @@ -130,6 +136,17 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + AllocatedTensors allocated_tensors = AllocatedTensors{ + /*tensor_type_backings=*/{ + {TensorTypeVariant{label_tensor}, label_tensor_backing}, + {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, + {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, + {TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing}, + }, + /*gradient_mapping=*/{}, + /*optimizer_mapping*/ {}, + }; + { printf("\nRunning test %d: E2ETest...\n", 1); RealmTrainingBacking realm_training_backing = RealmTrainingBacking( @@ -141,32 +158,26 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs }; + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + int num_epochs = 5; - int num_samples = batch_size.unwrap_nonnegative(); - std::vector loss_values(num_epochs); + std::vector loss_values; for (int i = 0; i < num_epochs; i++) { model_training_instance.forward(); model_training_instance.backward(); model_training_instance.update(); - float *host_loss_ptr = new float[num_samples]; - model_training_instance.write_loss_tensor_to_host(host_loss_ptr); - loss_values[i] = host_loss_ptr; + loss_values.push_back(copy_tensor_accessor_r( + model_training_instance.get_loss_tensor_accessor(), cpu_allocator)); } // Assert that each sample in the batch has a lower loss in last epoch than // the first epoch - float *first_epoch = loss_values[0]; - float *last_epoch = 
loss_values[num_epochs - 1]; - if(did_loss_decrease( - first_epoch, last_epoch, batch_size.unwrap_nonnegative())) { - printf("passed\n"); - } else { - printf("failed\n"); - } + GenericTensorAccessorR first_epoch_loss = loss_values.at(0); + + GenericTensorAccessorR last_epoch = loss_values.back(); - for (int i = 0; i < num_epochs; i++) { - delete[] loss_values[i]; - } + assert(did_loss_decrease(first_epoch_loss, last_epoch)); + printf("passed\n"); } -} \ No newline at end of file +} From 32971ef0c652ab4c5d3236470003d496791b48a5 Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 19:13:28 -0700 Subject: [PATCH 87/91] tweak: minor --- lib/local-execution/src/local_cost_estimator.cc | 2 -- lib/local-execution/test/src/test_e2e.cc | 4 ++-- lib/realm-backend/test/src/test_e2e.cc | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc index 0a84c19066..85f315c7d1 100644 --- a/lib/local-execution/src/local_cost_estimator.cc +++ b/lib/local-execution/src/local_cost_estimator.cc @@ -95,10 +95,8 @@ CostDetails LocalCostEstimator::estimate_cost( float fwd = execute_forward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed forward" << std::endl; float bwd = execute_backward(local_backing, operator_layer_guid, allocator).value(); - std::cout << "completed backward" << std::endl; float total_execution_time = fwd + bwd; diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index de759e2e01..f1c83e76a0 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -93,7 +93,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -104,7 +104,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc index ea8ca883bd..fa0976991d 100644 --- a/lib/realm-backend/test/src/test_e2e.cc +++ b/lib/realm-backend/test/src/test_e2e.cc @@ -100,7 +100,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, inputs_layer.outputs, @@ -111,7 +111,7 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, /*use_bias=*/false, DataType::FLOAT, - std::nullopt, + Activation::RELU, std::nullopt}}, std::nullopt}, linear_operator_1.outputs, From a1a8c14583a709e8f48a575230cbc2e500557f8d Mon Sep 17 00:00:00 2001 From: fruitea Date: Wed, 18 Jun 2025 20:33:03 -0700 Subject: [PATCH 88/91] tweak: minor --- lib/realm-backend/src/realm_allocator.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/realm-backend/src/realm_allocator.cc b/lib/realm-backend/src/realm_allocator.cc index d8c60f375f..287de0f2d5 100644 --- a/lib/realm-backend/src/realm_allocator.cc +++ b/lib/realm-backend/src/realm_allocator.cc @@ -21,6 +21,11 @@ void *RealmAllocatorImpl::allocate(size_t requested_memory_size) { 
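// Each allocation is backed by a Realm RegionInstance: create_instance builds
// a one-dimensional instance of `requested_memory_size` bytes in this
// processor's memory, the trailing .wait() blocks until the instance is
// ready, and the instance is recorded in `ptrs` so deallocate() can find and
// destroy it later.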
RegionInstance::create_instance(requested_instance, mem, bounds, field_sizes, /*SOA*/ 1, ProfilingRequestSet()) .wait(); + // TODO: it looks like there is no need to do this because the memory is already zeroed out + // char *zero_data = new char[requested_memory_size]; + // memset(zero_data, 0, requested_memory_size); + // requested_instance.write_untyped(0, (const void *)zero_data, requested_memory_size); + // delete[] zero_data; void *ptr = requested_instance.pointer_untyped(0, 0); this->ptrs.insert({ptr, requested_instance}); return ptr; From 4e3fb7d72282b4258f8faee4d024b525239ce57b Mon Sep 17 00:00:00 2001 From: fruitea Date: Sun, 3 Aug 2025 19:27:42 -0700 Subject: [PATCH 89/91] feat: reconstruct realm backend --- .flake/pkgs/ffdb/ffdb.py | 1 + .../src/export_model_arch.cc | 10 +- flake.lock | 18 +- flake.nix | 4 +- .../compiler/cost_estimator/cost_estimator.h | 4 +- .../cost_estimator/op_cost_estimate_key.h | 11 +- .../op_cost_estimate_key.struct.toml | 5 + .../compiler/cost_estimator/op_cost_metrics.h | 15 + .../op_cost_metrics.struct.toml | 11 +- .../runtime_only_cost_estimator.h | 52 ++ ..._only_cost_estimator_from_cost_estimator.h | 28 + .../runtime_only_op_cost_estimate_key.h | 18 + ...time_only_op_cost_estimate_key.struct.toml | 40 ++ .../runtime_only_op_cost_metrics.h | 14 + .../runtime_only_op_cost_metrics.struct.toml | 19 + ...easible_machine_mapping_result.struct.toml | 3 +- .../get_optimal_machine_mapping.h | 12 +- .../machine_mapping_context.struct.toml | 8 +- .../machine_mapping_problem_tree.h | 11 +- .../machine_mapping_problem_tree.variant.toml | 4 +- .../unmapped_op_cost_estimate_key.h | 13 +- .../unmapped_op_cost_estimate_key.struct.toml | 5 +- ...mapped_runtime_only_op_cost_estimate_key.h | 22 + ...time_only_op_cost_estimate_key.struct.toml | 34 ++ .../machine_mapping/machine_mapping_result.h | 5 +- .../get_optimal_machine_mapping_with_memory.h | 13 +- ...ne_mapping_with_memory_context.struct.toml | 23 + .../machine_mapping_with_memory_result.h | 2 +- ...hine_mapping_with_memory_state.struct.toml | 30 + .../pcg_task.variant.toml | 4 +- .../task_graph_simulator/task_simulator.h | 7 +- .../compiler/cost_estimator/cost_estimator.cc | 2 +- .../cost_estimator/op_cost_estimate_key.cc | 33 +- .../cost_estimator/op_cost_metrics.cc | 16 + .../runtime_only_cost_estimator.cc | 19 + ...only_cost_estimator_from_cost_estimator.cc | 45 ++ .../runtime_only_op_cost_estimate_key.cc | 17 + .../runtime_only_op_cost_metrics.cc | 14 + .../get_optimal_machine_mapping.cc | 28 +- .../get_machine_mapping_problem_tree.cc | 5 +- .../machine_mapping_problem_tree.cc | 19 +- .../unmapped_op_cost_estimate_key.cc | 39 +- ...apped_runtime_only_op_cost_estimate_key.cc | 39 ++ .../machine_mapping/machine_mapping_result.cc | 4 +- ...get_optimal_machine_mapping_with_memory.cc | 22 +- .../machine_mapping_with_memory_result.cc | 18 +- .../task_graph_simulator/pcg_task_graph.cc | 6 +- .../task_graph_execution_trace.cc | 6 +- .../task_graph_simulator/task_simulator.cc | 23 +- .../get_optimal_machine_mapping.cc | 126 +++-- .../get_tensor_set_movement_across_split.cc | 2 +- .../get_machine_mapping_problem_tree.cc | 54 +- .../machine_mapping/machine_mapping_result.cc | 16 +- ...get_optimal_machine_mapping_with_memory.cc | 102 ++-- .../machine_mapping_result_with_memory.cc | 71 +-- .../get_pcg_series_parallel_decomposition.cc | 2 +- .../task_graph_simulator/task_simulator.cc | 190 ++++--- .../cost_estimator_for_test.cc | 18 +- .../cost_estimator_for_test.h | 17 +- .../runtime_only_cost_estimator_for_test.cc | 52 ++
.../runtime_only_cost_estimator_for_test.h | 26 + lib/kernels/include/kernels/accessor.h | 107 ++-- lib/kernels/include/kernels/allocation.h | 4 +- lib/kernels/include/kernels/array_coord.h | 14 - lib/kernels/include/kernels/array_shape.h | 79 --- .../include/kernels/attention_kernels.h | 116 ++-- .../include/kernels/attention_kernels_cpu.h | 31 ++ .../include/kernels/attention_kernels_gpu.h | 52 ++ .../include/kernels/batch_matmul_kernels.h | 12 +- .../kernels/batch_matmul_kernels_cpu.h | 32 ++ .../kernels/batch_matmul_kernels_gpu.h | 38 ++ .../include/kernels/batch_norm_kernels.h | 39 +- .../include/kernels/batch_norm_kernels_cpu.h | 28 + .../include/kernels/batch_norm_kernels_gpu.h | 43 ++ lib/kernels/include/kernels/cast_kernels.h | 10 +- .../include/kernels/cast_kernels_cpu.h | 5 +- .../include/kernels/cast_kernels_gpu.h | 19 + lib/kernels/include/kernels/combine_kernels.h | 19 - .../include/kernels/combine_kernels_cpu.h | 17 - lib/kernels/include/kernels/concat_kernels.h | 6 +- .../include/kernels/concat_kernels_cpu.h | 19 + .../include/kernels/concat_kernels_gpu.h | 21 + lib/kernels/include/kernels/conv_2d_kernels.h | 99 ++-- .../include/kernels/conv_2d_kernels_cpu.h | 26 + .../include/kernels/conv_2d_kernels_gpu.h | 44 ++ .../conv_2d_per_device_state.struct.toml | 48 ++ .../include/kernels/copy_tensor_accessor.h | 2 +- .../kernels/create_accessor_with_contents.h | 17 +- .../create_local_allocator_for_device_type.h | 12 + lib/kernels/include/kernels/device_handle_t.h | 17 + .../kernels/device_handle_t.variant.toml | 16 + lib/kernels/include/kernels/device_stream_t.h | 15 + .../kernels/device_stream_t.variant.toml | 16 + lib/kernels/include/kernels/dropout_kernels.h | 83 +-- .../include/kernels/dropout_kernels_cpu.h | 12 + .../include/kernels/dropout_kernels_gpu.h | 33 ++ .../dropout_per_device_state.struct.toml | 40 ++ .../include/kernels/element_binary_kernels.h | 102 ++-- .../kernels/element_binary_kernels_cpu.h | 25 + .../kernels/element_binary_kernels_gpu.h | 43 ++ ...lement_binary_per_device_state.struct.toml | 32 ++ .../include/kernels/element_unary_kernels.h | 76 ++- .../kernels/element_unary_kernels_cpu.h | 22 + .../kernels/element_unary_kernels_gpu.h | 36 ++ ...element_unary_per_device_state.struct.toml | 19 + .../include/kernels/embedding_kernels.h | 16 +- .../include/kernels/embedding_kernels_cpu.h | 31 ++ .../include/kernels/embedding_kernels_gpu.h | 33 ++ lib/kernels/include/kernels/ff_handle.h | 23 +- .../include/kernels/fill_tensor_accessor.h | 2 +- lib/kernels/include/kernels/flat_kernels.h | 10 +- .../include/kernels/flat_kernels_cpu.h | 16 + .../include/kernels/flat_kernels_gpu.h | 20 + lib/kernels/include/kernels/gather_kernels.h | 39 +- .../include/kernels/gather_kernels_cpu.h | 18 + .../include/kernels/gather_kernels_gpu.h | 27 + .../gather_per_device_state.struct.toml | 16 + .../include/kernels/layer_norm_kernels.h | 93 ++-- .../include/kernels/layer_norm_kernels_cpu.h | 22 + .../include/kernels/layer_norm_kernels_gpu.h | 39 ++ .../layer_norm_per_device_state.struct.toml | 57 ++ lib/kernels/include/kernels/legion_dim.h | 4 + .../kernels/legion_ordered/legion_ordered.h | 8 +- lib/kernels/include/kernels/linear_kernels.h | 107 ++-- .../include/kernels/linear_kernels_cpu.h | 29 + .../include/kernels/linear_kernels_gpu.h | 49 ++ .../linear_per_device_state.struct.toml | 56 ++ .../include/kernels/loss_function_kernels.h | 25 +- .../kernels/loss_function_kernels_cpu.h | 41 ++ .../kernels/loss_function_kernels_gpu.h | 45 ++ .../kernels/managed_per_device_ff_handle.h | 
7 + .../include/kernels/map_tensor_accessors.h | 107 ++-- .../kernels/mha_per_device_state.struct.toml | 65 +++ .../include/kernels/optimizer_kernels.h | 79 +-- .../include/kernels/optimizer_kernels_cpu.h | 31 ++ .../include/kernels/optimizer_kernels_gpu.h | 59 ++ .../include/kernels/partition_kernels.h | 34 -- .../partition_per_device_state.struct.toml | 16 + .../kernels/per_device_op_state.variant.toml | 82 --- lib/kernels/include/kernels/pool_2d_kernels.h | 108 ++-- .../include/kernels/pool_2d_kernels_cpu.h | 15 + .../include/kernels/pool_2d_kernels_gpu.h | 46 ++ .../pool_2d_per_device_state.struct.toml | 32 ++ lib/kernels/include/kernels/profiling.h | 74 ++- lib/kernels/include/kernels/reduce_kernels.h | 57 +- .../include/kernels/reduce_kernels_cpu.h | 12 + .../include/kernels/reduce_kernels_gpu.h | 30 + .../reduce_per_device_state.struct.toml | 33 ++ .../include/kernels/reduce_tensor_accessor.h | 29 +- .../include/kernels/reduction_kernels.h | 20 - .../include/kernels/replicate_kernels.h | 20 - .../include/kernels/replicate_kernels_cpu.h | 18 - lib/kernels/include/kernels/reshape_kernels.h | 24 +- .../include/kernels/reshape_kernels_cpu.h | 16 + .../include/kernels/reshape_kernels_gpu.h | 19 + lib/kernels/include/kernels/reverse_kernels.h | 9 +- .../include/kernels/reverse_kernels_cpu.h | 1 - .../include/kernels/reverse_kernels_gpu.h | 22 + .../include/kernels/reverse_kernels_params.h | 4 +- lib/kernels/include/kernels/softmax_kernels.h | 58 +- .../include/kernels/softmax_kernels_cpu.h | 16 + .../include/kernels/softmax_kernels_gpu.h | 32 ++ .../softmax_per_device_state.struct.toml | 21 + lib/kernels/include/kernels/split_kernels.h | 20 +- .../include/kernels/split_kernels_cpu.h | 22 + .../include/kernels/split_kernels_gpu.h | 26 + .../kernels/tensor_accessor_binary_ops.h | 48 ++ .../kernels/tensor_accessor_unary_ops.h | 50 ++ lib/kernels/include/kernels/topk_kernels.h | 25 +- .../include/kernels/topk_kernels_cpu.h | 25 + .../include/kernels/topk_kernels_gpu.h | 27 + .../include/kernels/transpose_kernels.h | 14 +- .../include/kernels/transpose_kernels_cpu.h | 19 + .../include/kernels/transpose_kernels_gpu.h | 22 + lib/kernels/src/cpu/ops/combine_kernels.cc | 40 -- .../src/cpu/ops/initializer_kernels.cc | 9 +- lib/kernels/src/cpu/ops/replicate_kernels.cc | 53 -- lib/kernels/src/cuda/cuda_helper.cu | 52 +- lib/kernels/src/cuda/embedding_kernels.cu | 128 +++-- lib/kernels/src/cuda/loss_function_kernels.cu | 44 +- lib/kernels/src/cuda/ops/attention_kernels.cu | 143 ++--- .../src/cuda/ops/batch_matmul_kernels.cu | 50 +- .../src/cuda/ops/batch_norm_kernels.cu | 69 ++- lib/kernels/src/cuda/ops/cast_kernels.cu | 22 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 68 --- lib/kernels/src/cuda/ops/concat_kernels.cu | 40 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 119 ++-- lib/kernels/src/cuda/ops/dropout_kernels.cu | 64 +-- .../src/cuda/ops/element_binary_kernels.cu | 80 +-- .../src/cuda/ops/element_unary_kernels.cu | 73 +-- lib/kernels/src/cuda/ops/flat_kernels.cu | 33 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 87 +-- .../src/cuda/ops/layer_norm_kernels.cu | 72 +-- lib/kernels/src/cuda/ops/linear_kernels.cu | 152 ++--- lib/kernels/src/cuda/ops/partition_kernels.cu | 79 --- lib/kernels/src/cuda/ops/pool_2d_kernels.cu | 70 ++- lib/kernels/src/cuda/ops/reduce_kernels.cu | 42 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 86 --- lib/kernels/src/cuda/ops/replicate_kernels.cu | 84 --- lib/kernels/src/cuda/ops/reshape_kernels.cu | 61 +- lib/kernels/src/cuda/ops/reverse_kernels.cu | 22 +- 
lib/kernels/src/cuda/ops/softmax_kernels.cu | 40 +- lib/kernels/src/cuda/ops/split_kernels.cu | 30 +- lib/kernels/src/cuda/ops/topk_kernels.cu | 39 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 54 +- lib/kernels/src/cuda/optimizer_kernels.cu | 14 +- lib/kernels/src/ff_handle.cc | 1 + lib/kernels/src/internal/device.h | 8 +- lib/kernels/src/kernels/accessor.cc | 109 ++-- lib/kernels/src/kernels/allocation.cc | 7 +- lib/kernels/src/kernels/array_shape.cc | 150 ----- lib/kernels/src/kernels/attention_kernels.cc | 125 +++++ .../src/kernels/attention_kernels_cpu.cc | 25 + .../src/kernels/batch_matmul_kernels.cc | 93 ++++ .../src/kernels/batch_matmul_kernels_cpu.cc | 31 ++ lib/kernels/src/kernels/batch_norm_kernels.cc | 107 ++++ .../src/kernels/batch_norm_kernels_cpu.cc | 25 + lib/kernels/src/kernels/cast_kernels.cc | 39 ++ .../cast_kernels_cpu.cc} | 8 +- lib/kernels/src/kernels/concat_kernels.cc | 45 ++ lib/kernels/src/kernels/concat_kernels_cpu.cc | 17 + lib/kernels/src/kernels/conv_2d_kernels.cc | 118 ++++ .../src/kernels/conv_2d_kernels_cpu.cc | 25 + .../src/kernels/copy_tensor_accessor.cc | 20 +- .../create_local_allocator_for_device_type.cc | 16 + lib/kernels/src/kernels/device_handle_t.cc | 24 + lib/kernels/src/kernels/device_stream_t.cc | 25 + lib/kernels/src/kernels/dropout_kernels.cc | 79 +++ .../src/kernels/dropout_kernels_cpu.cc | 14 + .../src/kernels/element_binary_kernels.cc | 116 ++++ .../src/kernels/element_binary_kernels_cpu.cc | 25 + .../src/kernels/element_unary_kernels.cc | 92 ++++ .../src/kernels/element_unary_kernels_cpu.cc | 19 + lib/kernels/src/kernels/embedding_kernels.cc | 81 +++ .../src/kernels/embedding_kernels_cpu.cc | 29 + .../src/kernels/fill_tensor_accessor.cc | 28 +- lib/kernels/src/kernels/flat_kernels.cc | 42 ++ lib/kernels/src/kernels/flat_kernels_cpu.cc | 16 + .../src/kernels/format_accessor_contents.cc | 145 ++++- lib/kernels/src/kernels/gather_kernels.cc | 66 +++ lib/kernels/src/kernels/gather_kernels_cpu.cc | 17 + lib/kernels/src/kernels/layer_norm_kernels.cc | 99 ++++ .../src/kernels/layer_norm_kernels_cpu.cc | 21 + lib/kernels/src/kernels/legion_dim.cc | 14 + lib/kernels/src/kernels/linear_kernels.cc | 148 +++++ lib/kernels/src/kernels/linear_kernels_cpu.cc | 96 ++++ .../src/kernels/loss_function_kernels.cc | 126 +++++ .../src/kernels/loss_function_kernels_cpu.cc | 51 ++ lib/kernels/src/kernels/optimizer_kernels.cc | 98 ++++ .../src/kernels/optimizer_kernels_cpu.cc | 76 +++ lib/kernels/src/kernels/pool_2d_kernels.cc | 105 ++++ .../src/kernels/pool_2d_kernels_cpu.cc | 17 + lib/kernels/src/kernels/reduce_kernels.cc | 62 +++ lib/kernels/src/kernels/reduce_kernels_cpu.cc | 14 + lib/kernels/src/kernels/reshape_kernels.cc | 39 ++ .../src/kernels/reshape_kernels_cpu.cc | 15 + lib/kernels/src/kernels/reverse_kernels.cc | 33 ++ .../reverse_kernels_cpu.cc} | 23 +- .../src/kernels/reverse_kernels_params.cc | 20 +- lib/kernels/src/kernels/softmax_kernels.cc | 79 +++ .../src/kernels/softmax_kernels_cpu.cc | 16 + lib/kernels/src/kernels/split_kernels.cc | 63 +++ lib/kernels/src/kernels/split_kernels_cpu.cc | 24 + .../src/kernels/tensor_accessor_binary_ops.cc | 143 +++++ .../src/kernels/tensor_accessor_reductions.cc | 4 +- .../src/kernels/tensor_accessor_unary_ops.cc | 247 +++++++++ lib/kernels/src/kernels/topk_kernels.cc | 67 +++ lib/kernels/src/kernels/topk_kernels_cpu.cc | 25 + lib/kernels/src/kernels/transpose_kernels.cc | 45 ++ .../src/kernels/transpose_kernels_cpu.cc | 17 + .../src/managed_per_device_ff_handle.cc | 13 + 
.../test/src/cpu/ops/replicate_kernels.cc | 59 -- .../{src => test/src/internal}/test_utils.cc | 39 +- .../src/internal}/test_utils.h | 2 - lib/kernels/test/src/kernels/accessor.cc | 57 +- lib/kernels/test/src/kernels/array_shape.cc | 87 --- .../src/kernels/compare_tensor_accessors.cc | 2 +- .../kernels/create_accessor_with_contents.cc | 9 +- .../src/kernels/format_accessor_contents.cc | 64 ++- lib/kernels/test/src/kernels/legion_dim.cc | 2 +- .../src/kernels/legion_ordered/transform.cc | 2 +- .../test/src/kernels/linear_kernels.cc | 263 +++++++++ .../test/src/kernels/linear_kernels_cpu.cc | 175 ++++++ .../test/src/kernels/map_tensor_accessors.cc | 12 +- .../src/kernels/reduce_tensor_accessor.cc | 2 +- .../reverse_kernels_cpu.cc} | 4 +- .../src/kernels/tensor_accessor_unary_ops.cc | 178 ++++++ lib/kernels/test/src/test_attention_kernel.cc | 16 +- .../test/src/test_batch_matmul_kernel.cc | 56 +- .../test/src/test_batch_norm_kernel.cc | 24 +- lib/kernels/test/src/test_cast_kernel.cc | 17 +- lib/kernels/test/src/test_combine_kernel.cc | 106 ---- lib/kernels/test/src/test_concat_kernel.cc | 24 +- lib/kernels/test/src/test_cuda.cc | 2 +- lib/kernels/test/src/test_dropout.cc | 38 +- lib/kernels/test/src/test_flat_kernel.cc | 22 +- lib/kernels/test/src/test_gather_kernels.cc | 45 +- .../test/src/test_layer_norm_kernels.cc | 36 +- .../test/src/test_managed_ff_stream.cc | 107 ---- lib/kernels/test/src/test_partition_kernel.cc | 52 -- lib/kernels/test/src/test_pool_2d_kernels.cc | 30 +- lib/kernels/test/src/test_reduction_kernel.cc | 57 -- lib/kernels/test/src/test_replicate_kernel.cc | 153 ------ lib/kernels/test/src/test_reshape_kernel.cc | 27 +- lib/kernels/test/src/test_reverse_kernels.cc | 38 +- lib/kernels/test/src/test_softmax_kernel.cc | 33 +- lib/kernels/test/src/test_split_kernel.cc | 43 +- lib/kernels/test/src/test_transpose_kernel.cc | 18 +- lib/local-execution/CMakeLists.txt | 1 + .../local-execution/allocated_tensors.h | 32 -- .../allocated_tensors.struct.toml | 30 - .../local-execution/cost_details.struct.toml | 18 - .../include/local-execution/cost_estimate.h | 63 --- .../include/local-execution/cost_metrics.h | 70 --- .../local-execution/gradient_tensor_source.h | 22 - .../local-execution/local_args_backing.h | 38 +- .../local_args_backing.struct.toml | 18 + .../local-execution/local_cost_estimator.h | 16 +- .../local_task_argument_accessor.h | 22 +- .../local-execution/local_task_registry.h | 24 + .../local_task_registry.struct.toml | 26 + .../local-execution/local_tensor_backing.h | 50 +- .../local_tensor_backing.struct.toml | 18 +- .../local-execution/local_training_backing.h | 90 ++- .../local_training_backing.struct.toml | 26 + .../local-execution/model_training_instance.h | 13 +- .../local-execution/operator_task_set.h | 24 + .../operator_task_set.struct.toml | 24 + .../local-execution/optimizer_tensor_source.h | 22 - .../include/local-execution/registered_task.h | 12 + .../registered_task_t.variant.toml | 27 + .../include/local-execution/task_registry.h | 21 - .../local-execution/task_registry.struct.toml | 35 -- .../tensor_slot_backing.variant.toml | 23 + .../local-execution/tracked_allocator.h | 3 +- .../local-execution/unallocated_tensors.h | 27 - .../unallocated_tensors.struct.toml | 31 -- lib/local-execution/src/allocated_tensors.cc | 145 ----- .../src/local-execution/local_args_backing.cc | 62 +++ .../local-execution/local_cost_estimator.cc | 165 ++++++ .../local-execution/local_task_registry.cc | 64 +++ .../local-execution/local_tensor_backing.cc | 74 +++ 
.../local-execution/local_training_backing.cc | 221 ++++++++ .../model_training_instance.cc | 85 +++ .../src/local-execution/operator_task_set.cc | 71 +++ .../src/local-execution/registered_task.cc | 9 + lib/local-execution/src/local_args_backing.cc | 46 -- .../src/local_cost_estimator.cc | 122 ---- .../src/local_task_argument_accessor.cc | 23 +- .../src/local_tensor_backing.cc | 95 ---- .../src/local_training_backing.cc | 264 --------- lib/local-execution/src/loss_tensor_source.cc | 13 - .../src/model_training_instance.cc | 80 --- lib/local-execution/src/task_binding.cc | 58 +- lib/local-execution/src/task_registry.cc | 78 --- lib/local-execution/src/tracked_allocator.cc | 4 +- .../src/unallocated_tensors.cc | 92 ---- .../test/src/{ => internal}/test_utils.cc | 2 +- .../test/src/{ => internal}/test_utils.h | 0 .../local-execution/local_cost_estimator.cc | 142 +++++ .../local_task_argument_accessor.cc} | 47 +- .../local-execution/local_task_registry.cc | 278 ++++++++++ .../local-execution/local_tensor_backing.cc | 285 ++++++++++ .../local_training_backing.cc} | 77 +-- .../loss_functions.cc} | 127 +++-- .../test/src/test_allocated_tensors.cc | 226 -------- lib/local-execution/test/src/test_e2e.cc | 241 ++++++-- .../test/src/test_local_cost_estimator.cc | 76 --- .../test/src/test_local_tensor_backing.cc | 146 ----- .../test/src/test_task_registry.cc | 216 -------- .../test/src/test_unallocated_tensors.cc | 440 --------------- lib/models/src/models/bert/bert.cc | 29 +- .../src/models/candle_uno/candle_uno.cc | 3 +- lib/models/src/models/dlrm/dlrm.cc | 5 +- .../src/models/transformer/transformer.cc | 4 +- lib/op-attrs/include/op-attrs/datatype.h | 2 +- .../include/op-attrs/datatype_value.h | 4 + .../op-attrs/datatype_value.variant.toml | 13 + lib/op-attrs/include/op-attrs/ff_dim_t.h | 4 + .../include/op-attrs/ff_ordered/ff_ordered.h | 13 +- .../include/op-attrs/ff_ordered/filtrans.h | 20 + .../include/op-attrs/ff_ordered/reversed.h | 16 + .../include/op-attrs/ff_ordered/slice.h | 5 +- .../include/op-attrs/ff_ordered/transform.h | 3 +- .../include/op-attrs/ff_ordered/zip.h | 3 +- .../include/op-attrs/ff_ordered/zip_with.cc | 14 + .../include/op-attrs/ff_ordered/zip_with.h | 22 + .../op-attrs/ops/gather_attrs.struct.toml | 5 +- .../op-attrs/ops/layer_norm_attrs.struct.toml | 11 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 36 +- .../include/op-attrs/tensor_dims_coord.h | 17 + .../op-attrs/tensor_dims_coord.struct.toml} | 7 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 11 +- lib/op-attrs/src/op-attrs/datatype_value.cc | 38 ++ lib/op-attrs/src/op-attrs/ff_dim_t.cc | 7 + .../src/op-attrs/ff_ordered/filtrans.cc | 12 + .../src/op-attrs/ff_ordered/reversed.cc | 10 + lib/op-attrs/src/op-attrs/ops/attention.cc | 7 +- .../attention/multihead_attention_inputs.cc | 31 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/broadcast.cc | 4 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 6 +- .../ops/conv_2d/conv_2d_input_shape.cc | 21 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 5 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 4 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 8 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 14 +- lib/op-attrs/src/op-attrs/ops/softmax.cc | 3 +- .../src/op-attrs/parallel_tensor_dims.cc | 10 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 138 ++++- .../src/op-attrs/tensor_dims_coord.cc} | 14 +- 
lib/op-attrs/src/op-attrs/tensor_shape.cc | 26 +- .../test/src/op-attrs/datatype_value.cc | 75 +++ .../test/src/op-attrs/ff_ordered/concat.cc | 18 +- .../test/src/op-attrs/ff_ordered/enumerate.cc | 2 +- .../ff_ordered/ff_ordered_from_map.cc | 2 +- .../test/src/op-attrs/ff_ordered/reversed.cc | 26 + .../test/src/op-attrs/ff_ordered/transform.cc | 2 +- .../test/src/op-attrs/ff_ordered/zip.cc | 31 +- .../test/src/op-attrs/ff_ordered/zip_with.cc | 80 +++ .../test/src/op-attrs/ops/element_binary.cc | 3 +- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 16 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 129 +++++ .../test/src/op-attrs/tensor_dims_coord.cc} | 21 +- .../cg_operator_plus_signature.struct.toml | 23 + .../pcg/cg_operator_tensor_shape_signature.h | 20 + ...perator_tensor_shape_signature.struct.toml | 32 ++ .../include/pcg/computation_graph_builder.h | 2 +- .../pcg/file_format/v1/data_type_value.h | 2 +- lib/pcg/include/pcg/optimizer_attrs.h | 3 +- .../pcg_operator_plus_signature.struct.toml | 23 + ...perator_tensor_shape_signature.struct.toml | 31 ++ .../include/pcg}/tensor_role.enum.toml | 0 .../pcg/cg_operator_tensor_shape_signature.cc | 28 + lib/pcg/src/pcg/computation_graph_builder.cc | 46 +- lib/pcg/src/pcg/optimizer_attrs.cc | 10 +- .../parallel_computation_graph.cc | 9 +- lib/realm-backend/CMakeLists.txt | 1 + .../realm-backend/model_training_instance.h | 19 +- .../realm-backend/realm_args_backing.h | 38 -- .../realm_task_argument_accessor.h | 47 -- .../realm-backend/realm_tensor_backing.h | 47 -- .../realm_tensor_backing.struct.toml | 31 -- .../realm-backend/realm_training_backing.h | 90 ++- .../include/realm-backend/task_result.h | 1 - .../include/realm-backend/task_wrapper.h | 3 +- .../src/model_training_instance.cc | 132 +++-- lib/realm-backend/src/realm_args_backing.cc | 46 -- .../src/realm_task_argument_accessor.cc | 65 --- lib/realm-backend/src/realm_tensor_backing.cc | 94 ---- .../src/realm_training_backing copy.cc | 126 +++++ .../src/realm_training_backing.cc | 520 ++++++++---------- lib/realm-backend/src/task_wrapper.cc | 9 +- lib/realm-backend/test/src/test_e2e.cc | 328 +++++------ lib/realm-backend/test/src/test_update.cc | 14 +- lib/runtime/src/ops/embedding.cc | 120 ---- .../{concrete_arg.h => concrete_arg_spec.h} | 6 +- ...device_specific_device_states.variant.toml | 63 +-- ...toml => forward_tensor_guid_t.struct.toml} | 2 +- .../include/task-spec/forward_tensor_source.h | 22 + ...oml => gradient_tensor_guid_t.struct.toml} | 2 +- .../task-spec/gradient_tensor_source.h | 22 + .../task-spec/init_op_task_impl_function.h | 7 +- .../task-spec/itask_argument_accessor.h | 2 +- .../include/task-spec}/loss_functions.h | 15 +- ...ct.toml => loss_tensor_guid_t.struct.toml} | 2 +- .../include/task-spec}/loss_tensor_source.h | 8 +- lib/task-spec/include/task-spec/op_arg_ref.h | 8 +- .../task-spec/op_arg_spec.variant.toml | 2 +- .../include/task-spec/op_task_binding.h | 97 ++++ .../include/task-spec/op_task_invocation.h | 109 +--- .../task-spec/op_task_invocation.struct.toml | 16 + .../task-spec/op_task_to_task_invocation.h | 33 +- .../task-spec/op_tensor_slot_spec.struct.toml | 2 +- .../include/task-spec/op_tensor_spec.h | 16 +- .../task-spec/op_tensor_spec.struct.toml | 28 + lib/task-spec/include/task-spec/ops/combine.h | 23 - .../include/task-spec/ops/reduction.h | 24 - .../include/task-spec/ops/repartition.h | 26 - .../include/task-spec/ops/replicate.h | 23 - lib/task-spec/include/task-spec/ops/reshape.h | 3 - lib/task-spec/include/task-spec/ops/topk.h | 7 +- 
.../include/task-spec}/optimizer.h | 24 +- ...ml => optimizer_tensor_guid_t.struct.toml} | 2 +- .../task-spec/optimizer_tensor_source.h | 22 + ...parallel_tensor_shape_ref_type.struct.toml | 12 +- .../include/task-spec/per_device_op_state.h | 6 +- .../per_device_op_state.variant.toml | 64 +-- lib/task-spec/include/task-spec/profiling.h | 11 +- .../include/task-spec/runtime_arg_config.h | 21 +- .../task-spec/runtime_arg_config.struct.toml | 25 + .../include/task-spec/runtime_arg_ref.h | 18 +- .../task-spec/runtime_arg_ref_type.enum.toml | 17 + .../task-spec/task_arg_spec.variant.toml | 2 +- .../task-spec/task_argument_accessor.h | 14 +- .../include/task-spec/task_binding.h | 39 +- .../include/task-spec/task_id_t.enum.toml | 9 - .../include/task-spec/task_signature_impl.h | 10 +- ....toml => tensor_sub_slot_id_t.struct.toml} | 2 +- .../task-spec/tensor_type_t.variant.toml | 31 -- .../task-spec/training_computation_graph.h | 68 +++ .../training_computation_graph.struct.toml | 27 + .../task-spec/training_layer_plus_context.h | 50 ++ .../training_layer_plus_context.struct.toml | 29 + .../training_layer_tensor_group_signature.h | 20 + ...g_layer_tensor_group_signature.struct.toml | 19 + .../include/task-spec/training_tensor_group.h | 28 + .../training_tensor_group.struct.toml | 31 ++ .../training_tensor_group_with_attrs.h | 18 + ...aining_tensor_group_with_attrs.struct.toml | 37 ++ .../training_tensor_guid_t.variant.toml | 31 ++ .../{concrete_arg.cc => concrete_arg_spec.cc} | 2 +- .../src/task-spec/forward_tensor_source.cc | 18 + .../src/task-spec}/gradient_tensor_source.cc | 8 +- .../src/task-spec}/loss_functions.cc | 78 +-- .../src/task-spec/loss_tensor_source.cc | 13 + lib/task-spec/src/task-spec/op_arg_ref.cc | 27 +- .../task-spec/op_task_to_task_invocation.cc | 214 ++++--- lib/task-spec/src/task-spec/op_tensor_spec.cc | 12 +- lib/task-spec/src/task-spec/ops/attention.cc | 111 ++-- .../src/task-spec/ops/batch_matmul.cc | 88 +-- lib/task-spec/src/task-spec/ops/batch_norm.cc | 97 ++-- lib/task-spec/src/task-spec/ops/cast.cc | 26 +- lib/task-spec/src/task-spec/ops/combine.cc | 94 ---- lib/task-spec/src/task-spec/ops/concat.cc | 31 +- lib/task-spec/src/task-spec/ops/conv_2d.cc | 85 ++- lib/task-spec/src/task-spec/ops/dropout.cc | 72 ++- .../src/task-spec/ops/element_binary.cc | 71 ++- .../src/task-spec/ops/element_unary.cc | 63 ++- lib/task-spec/src/task-spec/ops/embedding.cc | 120 ++++ lib/task-spec/src/task-spec/ops/flat.cc | 26 +- lib/task-spec/src/task-spec/ops/gather.cc | 81 ++- lib/task-spec/src/task-spec/ops/layer_norm.cc | 82 ++- lib/task-spec/src/task-spec/ops/linear.cc | 158 +++--- lib/task-spec/src/task-spec/ops/pool_2d.cc | 83 ++- lib/task-spec/src/task-spec/ops/reduce.cc | 62 ++- lib/task-spec/src/task-spec/ops/reduction.cc | 101 ---- .../src/task-spec/ops/repartition.cc | 137 ----- lib/task-spec/src/task-spec/ops/replicate.cc | 99 ---- lib/task-spec/src/task-spec/ops/reshape.cc | 70 +-- lib/task-spec/src/task-spec/ops/reverse.cc | 25 +- lib/task-spec/src/task-spec/ops/softmax.cc | 87 ++- lib/task-spec/src/task-spec/ops/split.cc | 48 +- lib/task-spec/src/task-spec/ops/topk.cc | 79 +-- lib/task-spec/src/task-spec/ops/transpose.cc | 28 +- .../src/task-spec}/optimizer.cc | 85 +-- .../src/task-spec}/optimizer_tensor_source.cc | 8 +- lib/task-spec/src/task-spec/profiling.cc | 1 + .../src/task-spec/runtime_arg_config.cc | 30 + .../src/task-spec/runtime_arg_ref.cc | 20 +- .../src/task-spec/task_invocation.cc | 6 +- .../src/task-spec/task_signature_impl.cc | 83 +-- 
.../task-spec/training_computation_graph.cc | 183 ++++++ .../task-spec/training_layer_plus_context.cc | 122 ++++ .../training_layer_tensor_group_signature.cc | 31 ++ .../src/task-spec/training_tensor_group.cc | 48 ++ .../training_tensor_group_with_attrs.cc | 26 + .../src/task-spec/training_tensor_group.cc | 36 ++ .../training_tensor_group_with_attrs.cc | 84 +++ .../utils/archetypes/ordered_value_type.h | 10 + .../include/utils/containers/all_are_true.h | 17 + .../utils/containers/collapse_optionals.h | 19 + .../include/utils/containers/contains_value.h | 33 ++ .../include/utils/containers/filter_keys.h | 12 + lib/utils/include/utils/containers/filtrans.h | 6 +- lib/utils/include/utils/containers/flatmap.h | 11 + lib/utils/include/utils/exception.h | 3 +- lib/utils/include/utils/fmt/half.h | 26 + lib/utils/include/utils/fmt/set.h | 3 +- lib/utils/include/utils/{fp16.h => half.h} | 0 lib/utils/include/utils/json/half.h | 17 + .../utils/nonnegative_int/nonnegative_int.h | 1 + .../utils/nonnegative_int/nonnegative_range.h | 2 + lib/utils/include/utils/rapidcheck/half.h | 16 + .../include/utils/rapidcheck/monostate.h | 16 + .../include/utils/units/milliseconds_t.h | 67 +++ lib/utils/include/utils/units/num_bytes_t.h | 62 +++ lib/utils/src/{fp16.cc => half.cc} | 2 +- .../src/utils/containers/all_are_true.cc | 10 + .../utils/containers/collapse_optionals.cc | 11 + .../src/utils/containers/contains_value.cc | 13 + lib/utils/src/utils/containers/filtrans.cc | 11 + lib/utils/src/utils/fmt/half.cc | 9 + lib/utils/src/utils/fmt/set.cc | 15 + lib/utils/src/utils/json/half.cc | 13 + .../utils/nonnegative_int/nonnegative_int.cc | 33 +- .../nonnegative_int/nonnegative_range.cc | 4 + lib/utils/src/utils/rapidcheck/half.cc | 9 + lib/utils/src/utils/rapidcheck/monostate.cc | 9 + lib/utils/src/utils/units/milliseconds_t.cc | 94 ++++ lib/utils/src/utils/units/num_bytes_t.cc | 87 +++ .../include/test/utils/doctest/fmt/half.h | 16 + lib/utils/test/common/src/main.cc | 2 + .../common/src/test/utils/doctest/fmt/half.cc | 9 + .../test/src/utils/containers/all_are_true.cc | 36 ++ .../utils/containers/collapse_optionals.cc | 38 ++ .../src/utils/containers/contains_value.cc | 51 ++ .../src/utils/positive_int/positive_int.cc | 28 +- 601 files changed, 16269 insertions(+), 9723 deletions(-) create mode 100644 lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h create mode 100644 lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h create mode 100644 lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml create mode 100644 lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml 
create mode 100644 lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc create mode 100644 lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc create mode 100644 lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc rename lib/compiler/test/src/{compiler => internal}/cost_estimator_for_test.cc (73%) rename lib/compiler/test/src/{compiler => internal}/cost_estimator_for_test.h (69%) create mode 100644 lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc create mode 100644 lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h delete mode 100644 lib/kernels/include/kernels/array_coord.h delete mode 100644 lib/kernels/include/kernels/array_shape.h create mode 100644 lib/kernels/include/kernels/attention_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/attention_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/batch_matmul_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/batch_matmul_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/batch_norm_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/batch_norm_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/cast_kernels_gpu.h delete mode 100644 lib/kernels/include/kernels/combine_kernels.h delete mode 100644 lib/kernels/include/kernels/combine_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/concat_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/concat_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/create_local_allocator_for_device_type.h create mode 100644 lib/kernels/include/kernels/device_handle_t.h create mode 100644 lib/kernels/include/kernels/device_handle_t.variant.toml create mode 100644 lib/kernels/include/kernels/device_stream_t.h create mode 100644 lib/kernels/include/kernels/device_stream_t.variant.toml create mode 100644 lib/kernels/include/kernels/dropout_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/dropout_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/dropout_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/element_binary_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/element_binary_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/element_binary_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/element_unary_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/element_unary_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/element_unary_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/embedding_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/embedding_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/flat_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/flat_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/gather_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/gather_kernels_gpu.h 
create mode 100644 lib/kernels/include/kernels/gather_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/layer_norm_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/layer_norm_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/linear_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/linear_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/linear_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/loss_function_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/loss_function_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/mha_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/optimizer_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/optimizer_kernels_gpu.h delete mode 100644 lib/kernels/include/kernels/partition_kernels.h create mode 100644 lib/kernels/include/kernels/partition_per_device_state.struct.toml delete mode 100644 lib/kernels/include/kernels/per_device_op_state.variant.toml create mode 100644 lib/kernels/include/kernels/pool_2d_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/pool_2d_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/reduce_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reduce_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/reduce_per_device_state.struct.toml delete mode 100644 lib/kernels/include/kernels/reduction_kernels.h delete mode 100644 lib/kernels/include/kernels/replicate_kernels.h delete mode 100644 lib/kernels/include/kernels/replicate_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reshape_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/reshape_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/reverse_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/softmax_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/softmax_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/softmax_per_device_state.struct.toml create mode 100644 lib/kernels/include/kernels/split_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/split_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_binary_ops.h create mode 100644 lib/kernels/include/kernels/tensor_accessor_unary_ops.h create mode 100644 lib/kernels/include/kernels/topk_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/topk_kernels_gpu.h create mode 100644 lib/kernels/include/kernels/transpose_kernels_cpu.h create mode 100644 lib/kernels/include/kernels/transpose_kernels_gpu.h delete mode 100644 lib/kernels/src/cpu/ops/combine_kernels.cc delete mode 100644 lib/kernels/src/cpu/ops/replicate_kernels.cc delete mode 100644 lib/kernels/src/cuda/ops/combine_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/partition_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/reduction_kernels.cu delete mode 100644 lib/kernels/src/cuda/ops/replicate_kernels.cu delete mode 100644 lib/kernels/src/kernels/array_shape.cc create mode 100644 lib/kernels/src/kernels/attention_kernels.cc create mode 100644 lib/kernels/src/kernels/attention_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/batch_matmul_kernels.cc create mode 100644 lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/batch_norm_kernels.cc 
create mode 100644 lib/kernels/src/kernels/batch_norm_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/cast_kernels.cc rename lib/kernels/src/{cpu/ops/cast_kernels.cc => kernels/cast_kernels_cpu.cc} (82%) create mode 100644 lib/kernels/src/kernels/concat_kernels.cc create mode 100644 lib/kernels/src/kernels/concat_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/conv_2d_kernels.cc create mode 100644 lib/kernels/src/kernels/conv_2d_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/create_local_allocator_for_device_type.cc create mode 100644 lib/kernels/src/kernels/device_handle_t.cc create mode 100644 lib/kernels/src/kernels/device_stream_t.cc create mode 100644 lib/kernels/src/kernels/dropout_kernels.cc create mode 100644 lib/kernels/src/kernels/dropout_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/element_binary_kernels.cc create mode 100644 lib/kernels/src/kernels/element_binary_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/element_unary_kernels.cc create mode 100644 lib/kernels/src/kernels/element_unary_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/embedding_kernels.cc create mode 100644 lib/kernels/src/kernels/embedding_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/flat_kernels.cc create mode 100644 lib/kernels/src/kernels/flat_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/gather_kernels.cc create mode 100644 lib/kernels/src/kernels/gather_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/layer_norm_kernels.cc create mode 100644 lib/kernels/src/kernels/layer_norm_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/linear_kernels.cc create mode 100644 lib/kernels/src/kernels/linear_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/loss_function_kernels.cc create mode 100644 lib/kernels/src/kernels/loss_function_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/optimizer_kernels.cc create mode 100644 lib/kernels/src/kernels/optimizer_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/pool_2d_kernels.cc create mode 100644 lib/kernels/src/kernels/pool_2d_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reduce_kernels.cc create mode 100644 lib/kernels/src/kernels/reduce_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reshape_kernels.cc create mode 100644 lib/kernels/src/kernels/reshape_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/reverse_kernels.cc rename lib/kernels/src/{cpu/ops/reverse_kernels.cc => kernels/reverse_kernels_cpu.cc} (64%) create mode 100644 lib/kernels/src/kernels/softmax_kernels.cc create mode 100644 lib/kernels/src/kernels/softmax_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/split_kernels.cc create mode 100644 lib/kernels/src/kernels/split_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_binary_ops.cc create mode 100644 lib/kernels/src/kernels/tensor_accessor_unary_ops.cc create mode 100644 lib/kernels/src/kernels/topk_kernels.cc create mode 100644 lib/kernels/src/kernels/topk_kernels_cpu.cc create mode 100644 lib/kernels/src/kernels/transpose_kernels.cc create mode 100644 lib/kernels/src/kernels/transpose_kernels_cpu.cc delete mode 100644 lib/kernels/test/src/cpu/ops/replicate_kernels.cc rename lib/kernels/{src => test/src/internal}/test_utils.cc (85%) rename lib/kernels/{include/kernels => test/src/internal}/test_utils.h (96%) delete mode 100644 lib/kernels/test/src/kernels/array_shape.cc create mode 100644 lib/kernels/test/src/kernels/linear_kernels.cc create mode 100644 
lib/kernels/test/src/kernels/linear_kernels_cpu.cc rename lib/kernels/test/src/{cpu/ops/reverse_kernels.cc => kernels/reverse_kernels_cpu.cc} (99%) create mode 100644 lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc delete mode 100644 lib/kernels/test/src/test_combine_kernel.cc delete mode 100644 lib/kernels/test/src/test_managed_ff_stream.cc delete mode 100644 lib/kernels/test/src/test_partition_kernel.cc delete mode 100644 lib/kernels/test/src/test_reduction_kernel.cc delete mode 100644 lib/kernels/test/src/test_replicate_kernel.cc delete mode 100644 lib/local-execution/include/local-execution/allocated_tensors.h delete mode 100644 lib/local-execution/include/local-execution/allocated_tensors.struct.toml delete mode 100644 lib/local-execution/include/local-execution/cost_details.struct.toml delete mode 100644 lib/local-execution/include/local-execution/cost_estimate.h delete mode 100644 lib/local-execution/include/local-execution/cost_metrics.h delete mode 100644 lib/local-execution/include/local-execution/gradient_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/local_args_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_task_registry.h create mode 100644 lib/local-execution/include/local-execution/local_task_registry.struct.toml create mode 100644 lib/local-execution/include/local-execution/local_training_backing.struct.toml create mode 100644 lib/local-execution/include/local-execution/operator_task_set.h create mode 100644 lib/local-execution/include/local-execution/operator_task_set.struct.toml delete mode 100644 lib/local-execution/include/local-execution/optimizer_tensor_source.h create mode 100644 lib/local-execution/include/local-execution/registered_task.h create mode 100644 lib/local-execution/include/local-execution/registered_task_t.variant.toml delete mode 100644 lib/local-execution/include/local-execution/task_registry.h delete mode 100644 lib/local-execution/include/local-execution/task_registry.struct.toml create mode 100644 lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml delete mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.h delete mode 100644 lib/local-execution/include/local-execution/unallocated_tensors.struct.toml delete mode 100644 lib/local-execution/src/allocated_tensors.cc create mode 100644 lib/local-execution/src/local-execution/local_args_backing.cc create mode 100644 lib/local-execution/src/local-execution/local_cost_estimator.cc create mode 100644 lib/local-execution/src/local-execution/local_task_registry.cc create mode 100644 lib/local-execution/src/local-execution/local_tensor_backing.cc create mode 100644 lib/local-execution/src/local-execution/local_training_backing.cc create mode 100644 lib/local-execution/src/local-execution/model_training_instance.cc create mode 100644 lib/local-execution/src/local-execution/operator_task_set.cc create mode 100644 lib/local-execution/src/local-execution/registered_task.cc delete mode 100644 lib/local-execution/src/local_args_backing.cc delete mode 100644 lib/local-execution/src/local_cost_estimator.cc delete mode 100644 lib/local-execution/src/local_tensor_backing.cc delete mode 100644 lib/local-execution/src/local_training_backing.cc delete mode 100644 lib/local-execution/src/loss_tensor_source.cc delete mode 100644 lib/local-execution/src/model_training_instance.cc delete mode 100644 lib/local-execution/src/task_registry.cc delete mode 100644 
lib/local-execution/src/unallocated_tensors.cc rename lib/local-execution/test/src/{ => internal}/test_utils.cc (94%) rename lib/local-execution/test/src/{ => internal}/test_utils.h (100%) create mode 100644 lib/local-execution/test/src/local-execution/local_cost_estimator.cc rename lib/local-execution/test/src/{test_local_task_arg_accessor.cc => local-execution/local_task_argument_accessor.cc} (86%) create mode 100644 lib/local-execution/test/src/local-execution/local_task_registry.cc create mode 100644 lib/local-execution/test/src/local-execution/local_tensor_backing.cc rename lib/local-execution/test/src/{test_update.cc => local-execution/local_training_backing.cc} (68%) rename lib/local-execution/test/src/{test_loss_functions.cc => local-execution/loss_functions.cc} (54%) delete mode 100644 lib/local-execution/test/src/test_allocated_tensors.cc delete mode 100644 lib/local-execution/test/src/test_local_cost_estimator.cc delete mode 100644 lib/local-execution/test/src/test_local_tensor_backing.cc delete mode 100644 lib/local-execution/test/src/test_task_registry.cc delete mode 100644 lib/local-execution/test/src/test_unallocated_tensors.cc create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/reversed.h create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc create mode 100644 lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h create mode 100644 lib/op-attrs/include/op-attrs/tensor_dims_coord.h rename lib/{kernels/include/kernels/array_coord.struct.toml => op-attrs/include/op-attrs/tensor_dims_coord.struct.toml} (74%) create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc create mode 100644 lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc rename lib/{kernels/src/kernels/array_coord.cc => op-attrs/src/op-attrs/tensor_dims_coord.cc} (53%) create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc create mode 100644 lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc rename lib/{kernels/test/src/kernels/array_coord.cc => op-attrs/test/src/op-attrs/tensor_dims_coord.cc} (59%) create mode 100644 lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml create mode 100644 lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h create mode 100644 lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml create mode 100644 lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml create mode 100644 lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml rename lib/{task-spec/include/task-spec => pcg/include/pcg}/tensor_role.enum.toml (100%) create mode 100644 lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc delete mode 100644 lib/realm-backend/include/realm-backend/realm_args_backing.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.h delete mode 100644 lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml delete mode 100644 lib/realm-backend/src/realm_args_backing.cc delete mode 100644 lib/realm-backend/src/realm_task_argument_accessor.cc delete mode 100644 lib/realm-backend/src/realm_tensor_backing.cc create mode 100644 lib/realm-backend/src/realm_training_backing copy.cc delete mode 100644 lib/runtime/src/ops/embedding.cc rename lib/task-spec/include/task-spec/{concrete_arg.h => concrete_arg_spec.h} (89%) rename lib/task-spec/include/task-spec/{optimizer_tensor_t.struct.toml => 
forward_tensor_guid_t.struct.toml} (79%) create mode 100644 lib/task-spec/include/task-spec/forward_tensor_source.h rename lib/task-spec/include/task-spec/{gradient_tensor_t.struct.toml => gradient_tensor_guid_t.struct.toml} (78%) create mode 100644 lib/task-spec/include/task-spec/gradient_tensor_source.h rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_functions.h (69%) rename lib/task-spec/include/task-spec/{loss_tensor_t.struct.toml => loss_tensor_guid_t.struct.toml} (87%) rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/loss_tensor_source.h (50%) create mode 100644 lib/task-spec/include/task-spec/op_task_binding.h create mode 100644 lib/task-spec/include/task-spec/op_task_invocation.struct.toml create mode 100644 lib/task-spec/include/task-spec/op_tensor_spec.struct.toml delete mode 100644 lib/task-spec/include/task-spec/ops/combine.h delete mode 100644 lib/task-spec/include/task-spec/ops/reduction.h delete mode 100644 lib/task-spec/include/task-spec/ops/repartition.h delete mode 100644 lib/task-spec/include/task-spec/ops/replicate.h rename lib/{local-execution/include/local-execution => task-spec/include/task-spec}/optimizer.h (51%) rename lib/task-spec/include/task-spec/{lowered_tensor_t.struct.toml => optimizer_tensor_guid_t.struct.toml} (78%) create mode 100644 lib/task-spec/include/task-spec/optimizer_tensor_source.h create mode 100644 lib/task-spec/include/task-spec/runtime_arg_config.struct.toml create mode 100644 lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml rename lib/task-spec/include/task-spec/{slot_tensor_type_id.struct.toml => tensor_sub_slot_id_t.struct.toml} (90%) delete mode 100644 lib/task-spec/include/task-spec/tensor_type_t.variant.toml create mode 100644 lib/task-spec/include/task-spec/training_computation_graph.h create mode 100644 lib/task-spec/include/task-spec/training_computation_graph.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_layer_plus_context.h create mode 100644 lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h create mode 100644 lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_group.h create mode 100644 lib/task-spec/include/task-spec/training_tensor_group.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h create mode 100644 lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml create mode 100644 lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml rename lib/task-spec/src/task-spec/{concrete_arg.cc => concrete_arg_spec.cc} (94%) create mode 100644 lib/task-spec/src/task-spec/forward_tensor_source.cc rename lib/{local-execution/src => task-spec/src/task-spec}/gradient_tensor_source.cc (55%) rename lib/{local-execution/src => task-spec/src/task-spec}/loss_functions.cc (66%) create mode 100644 lib/task-spec/src/task-spec/loss_tensor_source.cc delete mode 100644 lib/task-spec/src/task-spec/ops/combine.cc create mode 100644 lib/task-spec/src/task-spec/ops/embedding.cc delete mode 100644 lib/task-spec/src/task-spec/ops/reduction.cc delete mode 100644 lib/task-spec/src/task-spec/ops/repartition.cc delete mode 100644 lib/task-spec/src/task-spec/ops/replicate.cc rename lib/{local-execution/src => task-spec/src/task-spec}/optimizer.cc (76%) rename 
lib/{local-execution/src => task-spec/src/task-spec}/optimizer_tensor_source.cc (55%) create mode 100644 lib/task-spec/src/task-spec/profiling.cc create mode 100644 lib/task-spec/src/task-spec/runtime_arg_config.cc create mode 100644 lib/task-spec/src/task-spec/training_computation_graph.cc create mode 100644 lib/task-spec/src/task-spec/training_layer_plus_context.cc create mode 100644 lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc create mode 100644 lib/task-spec/src/task-spec/training_tensor_group.cc create mode 100644 lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc create mode 100644 lib/task-spec/test/src/task-spec/training_tensor_group.cc create mode 100644 lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc create mode 100644 lib/utils/include/utils/containers/all_are_true.h create mode 100644 lib/utils/include/utils/containers/collapse_optionals.h create mode 100644 lib/utils/include/utils/containers/contains_value.h create mode 100644 lib/utils/include/utils/fmt/half.h rename lib/utils/include/utils/{fp16.h => half.h} (100%) create mode 100644 lib/utils/include/utils/json/half.h create mode 100644 lib/utils/include/utils/rapidcheck/half.h create mode 100644 lib/utils/include/utils/rapidcheck/monostate.h create mode 100644 lib/utils/include/utils/units/milliseconds_t.h create mode 100644 lib/utils/include/utils/units/num_bytes_t.h rename lib/utils/src/{fp16.cc => half.cc} (87%) create mode 100644 lib/utils/src/utils/containers/all_are_true.cc create mode 100644 lib/utils/src/utils/containers/collapse_optionals.cc create mode 100644 lib/utils/src/utils/containers/contains_value.cc create mode 100644 lib/utils/src/utils/fmt/half.cc create mode 100644 lib/utils/src/utils/json/half.cc create mode 100644 lib/utils/src/utils/rapidcheck/half.cc create mode 100644 lib/utils/src/utils/rapidcheck/monostate.cc create mode 100644 lib/utils/src/utils/units/milliseconds_t.cc create mode 100644 lib/utils/src/utils/units/num_bytes_t.cc create mode 100644 lib/utils/test/common/include/test/utils/doctest/fmt/half.h create mode 100644 lib/utils/test/common/src/test/utils/doctest/fmt/half.cc create mode 100644 lib/utils/test/src/utils/containers/all_are_true.cc create mode 100644 lib/utils/test/src/utils/containers/collapse_optionals.cc create mode 100644 lib/utils/test/src/utils/containers/contains_value.cc diff --git a/.flake/pkgs/ffdb/ffdb.py b/.flake/pkgs/ffdb/ffdb.py index 84354ccd82..b5fc3956bf 100644 --- a/.flake/pkgs/ffdb/ffdb.py +++ b/.flake/pkgs/ffdb/ffdb.py @@ -5,3 +5,4 @@ gdb.execute(f'directory {get_config_root(Path.cwd())}') gdb.prompt_hook = lambda x: '(ffdb) ' gdb.execute('set history save on') +gdb.execute('catch throw') diff --git a/bin/export-model-arch/src/export_model_arch.cc b/bin/export-model-arch/src/export_model_arch.cc index 2dfbc275ec..82aebd2b2c 100644 --- a/bin/export-model-arch/src/export_model_arch.cc +++ b/bin/export-model-arch/src/export_model_arch.cc @@ -23,11 +23,11 @@ using namespace ::FlexFlow; ComputationGraph get_single_operator_computation_graph() { ComputationGraphBuilder b; - nonnegative_int batch_size = 8_n; - nonnegative_int in_channels = 16_n; - nonnegative_int out_channels = 12_n; + positive_int batch_size = 8_p; + positive_int in_channels = 16_p; + positive_int out_channels = 12_p; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered<nonnegative_int>{ + TensorDims{FFOrdered<positive_int>{ batch_size, in_channels, out_channels, @@ -73,7 +73,7 @@ tl::expected } else if (model_name == "dlrm") { return
get_dlrm_computation_graph(get_default_dlrm_config()); } else if (model_name == "split_test") { - nonnegative_int batch_size = 8_n; + positive_int batch_size = 8_p; return get_split_test_computation_graph(batch_size); } else if (model_name == "single_operator") { return get_single_operator_computation_graph(); diff --git a/flake.lock b/flake.lock index ff6e797d51..f016c47f45 100644 --- a/flake.lock +++ b/flake.lock @@ -66,17 +66,17 @@ ] }, "locked": { - "lastModified": 1746157536, - "narHash": "sha256-g4Hx/05+Ce3hl8OS1zm4pY/+ThD1blWKmcaPsohSX5Y=", - "owner": "lockshaw", - "repo": "proj", - "rev": "5871bc7b7fb9d7d7f14c8bca6c50a0cf2e75834d", - "type": "github" + "lastModified": 1752259929, + "narHash": "sha256-GkMRIi6Xk3qswrbekWtO1sQYz61mw25+62boDk1Gd7s=", + "ref": "refs/heads/master", + "rev": "669773600c781ab8b29ac2379d0c070721417f9d", + "revCount": 117, + "type": "git", + "url": "https://git.sr.ht/~lockshaw/proj" }, "original": { - "owner": "lockshaw", - "repo": "proj", - "type": "github" + "type": "git", + "url": "https://git.sr.ht/~lockshaw/proj" } }, "root": { diff --git a/flake.nix b/flake.nix index 5fa48fa3fd..474a22f385 100644 --- a/flake.nix +++ b/flake.nix @@ -18,7 +18,7 @@ flake-utils.url = "github:numtide/flake-utils"; proj-repo = { - url = "github:lockshaw/proj"; + url = "git+https://git.sr.ht/~lockshaw/proj"; inputs.nixpkgs.follows = "nixpkgs"; inputs.flake-utils.follows = "flake-utils"; }; @@ -121,6 +121,7 @@ lcov # for code coverage compdb gbenchmark + libtorch-bin ]) (with proj-repo.packages.${system}; [ proj @@ -177,6 +178,7 @@ frozendict black toml + numpy ]) (with self.packages.${system}; [ ffdb diff --git a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h index ecaffa337b..7b7255a89d 100644 --- a/lib/compiler/include/compiler/cost_estimator/cost_estimator.h +++ b/lib/compiler/include/compiler/cost_estimator/cost_estimator.h @@ -13,7 +13,7 @@ namespace FlexFlow { struct ICostEstimator { virtual OpCostMetrics estimate_cost(OpCostEstimateKey const &) const = 0; - virtual float estimate_cost(TensorSetMovement const &) const = 0; + virtual milliseconds_t estimate_cost(TensorSetMovement const &) const = 0; ICostEstimator() = default; ICostEstimator(ICostEstimator const &) = delete; @@ -25,7 +25,7 @@ CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); struct CostEstimator { OpCostMetrics estimate_cost(OpCostEstimateKey const &) const; - float estimate_cost(TensorSetMovement const &m) const; + milliseconds_t estimate_cost(TensorSetMovement const &m) const; template static typename std::enable_if::value, diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h index 93a1143cde..d905abeb77 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" #include "pcg/device_id_t.dtg.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" @@ -11,9 +12,17 @@ namespace FlexFlow { OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( ParallelComputationGraph const &pcg, - parallel_layer_guid_t const &layer, + OptimizerAttrs 
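The float-to-milliseconds_t switch in ICostEstimator and CostEstimator above is the recurring theme of this series: raw floats that used to carry runtimes are replaced by a dedicated unit type, so durations can no longer be silently mixed with unrelated scalars. A minimal sketch of the idea follows; the real type lives in utils/units/milliseconds_t.h elsewhere in this patch, and the accessor and operator set shown here are illustrative assumptions, not its actual interface.

#include <cassert>

// Illustrative stand-in for a strongly-typed duration in the spirit of
// milliseconds_t; raw_value() and operator+ are assumptions for this sketch.
struct ms_t {
  explicit ms_t(float v) : value_(v) {}
  float raw_value() const { return value_; }
  ms_t operator+(ms_t other) const { return ms_t{value_ + other.value_}; }
private:
  float value_;
};

int main() {
  ms_t fwd{1.5f};
  ms_t bwd{3.0f};
  ms_t total = fwd + bwd;      // fine: both operands are durations
  // float bad = fwd + 2.0f;   // rejected by the compiler: no silent mixing
  assert(total.raw_value() == 4.5f);
  return 0;
}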
const &optimizer_attrs, + parallel_layer_guid_t const ¶llel_layer_guid, MachineView const &machine_view); +RuntimeOnlyOpCostEstimateKey + runtime_only_from_op_cost_estimate_key(OpCostEstimateKey const &); + +OpCostEstimateKey make_op_cost_estimate_key_from_runtime_only( + RuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs); + } // namespace FlexFlow #endif diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml index 8fd860d00d..b153bd0072 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.struct.toml @@ -12,6 +12,7 @@ includes = [ "op-attrs/parallel_tensor_shape.dtg.h", "", "pcg/machine_view.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] src_includes = [ @@ -35,6 +36,10 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>" name = "output_shapes" type = "std::vector<::FlexFlow::ParallelTensorShape>" +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + [[fields]] name = "machine_view" type = "::FlexFlow::MachineView" diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h new file mode 100644 index 0000000000..f2d12aff71 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_METRICS_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_METRICS_H + +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" + +namespace FlexFlow { + +OpCostMetrics make_op_cost_metrics_from_runtime_only( + RuntimeOnlyOpCostMetrics const &runtime_only, + num_bytes_t const &memory_usage); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml index 5e81d6c10e..7d0c7684a9 100644 --- a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml +++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml @@ -7,17 +7,18 @@ features = [ ] includes = [ - "utils/nonnegative_int/nonnegative_int.h" + "utils/units/milliseconds_t.h", + "utils/units/num_bytes_t.h", ] [[fields]] name = "forward_runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] name = "backward_runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] -name = "memory" -type = "::FlexFlow::nonnegative_int" +name = "memory_usage" +type = "::FlexFlow::num_bytes_t" diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h new file mode 100644 index 0000000000..aa1c2d70b6 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator.h @@ -0,0 +1,52 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/tensor_set_movement.dtg.h" +#include 
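To make the relationship between the two key types concrete: runtime_only_from_op_cost_estimate_key forgets the optimizer state, and make_op_cost_estimate_key_from_runtime_only re-attaches it. A hedged usage sketch against the declarations above; that the pair round-trips exactly is an inference from the field lists, not something the patch states.

// Sketch: projecting a full key down to its runtime-only part and rebuilding
// it, assuming only the declarations added in op_cost_estimate_key.h.
#include "compiler/cost_estimator/op_cost_estimate_key.h"

namespace FlexFlow {

OpCostEstimateKey round_trip(OpCostEstimateKey const &key) {
  // Drop optimizer_attrs: the result depends only on op attrs, tensor
  // shapes, and the machine view.
  RuntimeOnlyOpCostEstimateKey runtime_only =
      runtime_only_from_op_cost_estimate_key(key);

  // Re-attach the same optimizer state; since optimizer_attrs is the only
  // field the projection discards, this should reproduce key exactly.
  return make_op_cost_estimate_key_from_runtime_only(runtime_only,
                                                     key.optimizer_attrs);
}

} // namespace FlexFlow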
"op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "pcg/machine_view.dtg.h" +#include + +namespace FlexFlow { + +struct IRuntimeOnlyCostEstimator { + virtual RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const = 0; + virtual milliseconds_t estimate_cost(TensorSetMovement const &) const = 0; + + IRuntimeOnlyCostEstimator() = default; + IRuntimeOnlyCostEstimator(IRuntimeOnlyCostEstimator const &) = delete; + IRuntimeOnlyCostEstimator & + operator=(IRuntimeOnlyCostEstimator const &) = delete; + + virtual ~IRuntimeOnlyCostEstimator() = default; +}; +CHECK_RC_COPY_VIRTUAL_COMPLIANT(IRuntimeOnlyCostEstimator); + +struct RuntimeOnlyCostEstimator { + RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const; + milliseconds_t estimate_cost(TensorSetMovement const &m) const; + + template + static typename std::enable_if< + std::is_base_of::value, + RuntimeOnlyCostEstimator>::type + create(Args &&...args) { + return RuntimeOnlyCostEstimator( + std::make_shared(std::forward(args)...)); + } + +private: + RuntimeOnlyCostEstimator( + std::shared_ptr implementation_ptr); + +private: + std::shared_ptr implementation_ptr; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h new file mode 100644 index 0000000000..5757560f9d --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_FROM_COST_ESTIMATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_COST_ESTIMATOR_FROM_COST_ESTIMATOR_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +struct RuntimeOnlyCostEstimatorFromCostEstimator final + : public IRuntimeOnlyCostEstimator { + RuntimeOnlyCostEstimatorFromCostEstimator() = delete; + RuntimeOnlyCostEstimatorFromCostEstimator( + CostEstimator const &cost_estimator); + + RuntimeOnlyOpCostMetrics + estimate_cost(RuntimeOnlyOpCostEstimateKey const &) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; + +private: + CostEstimator cost_estimator; +}; + +RuntimeOnlyCostEstimator + runtime_only_cost_estimator_from_cost_estimator(CostEstimator const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h new file mode 100644 index 0000000000..fc3157d74a --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostEstimateKey + get_mapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + 
parallel_layer_guid_t const ¶llel_layer_guid, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml new file mode 100644 index 0000000000..94be6f6e69 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_estimate_key.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "RuntimeOnlyOpCostEstimateKey" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "op-attrs/parallel_tensor_shape.dtg.h", + "", + "pcg/machine_view.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "machine_view" +type = "::FlexFlow::MachineView" diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h new file mode 100644 index 0000000000..6b4e34fd75 --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_METRICS_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_RUNTIME_ONLY_OP_COST_METRICS_H + +#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostMetrics + runtime_only_from_op_cost_metrics(OpCostMetrics const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml new file mode 100644 index 0000000000..65ac318f0e --- /dev/null +++ b/lib/compiler/include/compiler/cost_estimator/runtime_only_op_cost_metrics.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "RuntimeOnlyOpCostMetrics" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "utils/units/milliseconds_t.h", +] + +[[fields]] +name = "forward_runtime" +type = "::FlexFlow::milliseconds_t" + +[[fields]] +name = "backward_runtime" +type = "::FlexFlow::milliseconds_t" diff --git a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml index e71cfc540f..8dda2d15ba 100644 --- a/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/feasible_machine_mapping_result.struct.toml @@ -8,11 +8,12 @@ features = [ includes = [ "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h", + "utils/units/milliseconds_t.h", ] [[fields]] name = "runtime" -type = "float" +type = "::FlexFlow::milliseconds_t" [[fields]] name = "machine_mapping" diff --git a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h 
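The metrics types mirror the key split: RuntimeOnlyOpCostMetrics keeps only the two runtimes, while OpCostMetrics additionally carries memory_usage as a num_bytes_t. A hedged sketch of the two conversion helpers declared above:

// Sketch: moving between the full and runtime-only metrics using the helpers
// from op_cost_metrics.h and runtime_only_op_cost_metrics.h above.
#include "compiler/cost_estimator/op_cost_metrics.h"
#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h"

namespace FlexFlow {

OpCostMetrics attach_memory_usage(RuntimeOnlyOpCostMetrics const &runtime_only,
                                  num_bytes_t const &measured_memory) {
  // forward_runtime and backward_runtime carry over unchanged; only the
  // memory_usage field is filled in.
  return make_op_cost_metrics_from_runtime_only(runtime_only, measured_memory);
}

RuntimeOnlyOpCostMetrics drop_memory_usage(OpCostMetrics const &metrics) {
  // Inverse direction: discard memory_usage, keep the two runtimes.
  return runtime_only_from_op_cost_metrics(metrics);
}

} // namespace FlexFlow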
b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h index 62da90bfcb..2cd3f3e289 100644 --- a/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h +++ b/lib/compiler/include/compiler/machine_mapping/get_optimal_machine_mapping.h @@ -35,12 +35,12 @@ MachineMappingResult get_optimal_machine_mapping( MachineSpecification const &resources, MachineMappingConstraints const &constraints); -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resources, - MachineMappingConstraints const &constraints); +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, + MachineSpecification const &resources, + MachineMappingConstraints const &constraints); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml index 81e26f491d..dd49aaa98a 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_context.struct.toml @@ -3,16 +3,16 @@ name = "MachineMappingContext" features = [] includes = [ - "compiler/cost_estimator/cost_estimator.h", + "compiler/cost_estimator/runtime_only_cost_estimator.h", "pcg/machine_view.dtg.h", "pcg/machine_specification.dtg.h", - "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", ] [[fields]] name = "cost_estimator" -type = "::FlexFlow::CostEstimator" +type = "::FlexFlow::RuntimeOnlyCostEstimator" [[fields]] name = "allowed_machine_views" -type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" +type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h index 29e9e7c90b..65f7006b21 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h @@ -10,15 +10,16 @@ namespace FlexFlow { -GenericBinarySPDecompositionTreeImplementation<MachineMappingProblemTree, - MMProblemTreeSeriesSplit, - MMProblemTreeParallelSplit, - UnmappedOpCostEstimateKey> +GenericBinarySPDecompositionTreeImplementation< + MachineMappingProblemTree, + MMProblemTreeSeriesSplit, + MMProblemTreeParallelSplit, + UnmappedRuntimeOnlyOpCostEstimateKey> generic_binary_sp_impl_for_mm_problem_tree(); SPDecompositionTreeNodeType get_node_type(MachineMappingProblemTree const &); -std::unordered_multiset<UnmappedOpCostEstimateKey> +std::unordered_multiset<UnmappedRuntimeOnlyOpCostEstimateKey> get_leaves(MachineMappingProblemTree const &); std::unordered_set<BinaryTreePath> get_all_leaf_paths(MachineMappingProblemTree const &); diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml index 1949f143cb..808853994a 100644 ---
a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.variant.toml @@ -9,7 +9,7 @@ features = [ includes = [ "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h", "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h", - "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", ] [[values]] @@ -21,5 +21,5 @@ type = "::FlexFlow::MMProblemTreeParallelSplit" key = "parallel" [[values]] -type = "::FlexFlow::UnmappedOpCostEstimateKey" +type = "::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey" key = "leaf" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h index 9fbad4a1d0..cfffeee245 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h @@ -3,13 +3,24 @@ #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" namespace FlexFlow { UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( - ParallelComputationGraph const &, parallel_layer_guid_t const &); + ParallelComputationGraph const &pcg, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const ¶llel_layer_guid); + +UnmappedOpCostEstimateKey unmapped_op_cost_estimate_key_from_runtime_only( + UnmappedRuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs); + +UnmappedRuntimeOnlyOpCostEstimateKey + runtime_only_from_unmapped_op_cost_estimate_key( + UnmappedOpCostEstimateKey const &runtime_only); OpCostEstimateKey map_unmapped_op_cost_estimate_key(UnmappedOpCostEstimateKey const &unmapped, diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml index fe76683eb7..5dcfd33859 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.struct.toml @@ -10,7 +10,7 @@ includes = [ "op-attrs/pcg_operator_attrs.dtg.h", "op-attrs/parallel_tensor_shape.dtg.h", "", - "pcg/machine_view.dtg.h", + "pcg/optimizer_attrs.dtg.h", ] src_includes = [ @@ -34,3 +34,6 @@ type = "std::vector<::FlexFlow::ParallelTensorShape>" name = "output_shapes" type = "std::vector<::FlexFlow::ParallelTensorShape>" +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git 
a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h new file mode 100644 index 0000000000..c1de7cb956 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_PROBLEM_TREE_UNMAPPED_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_MACHINE_MAPPING_MACHINE_MAPPING_PROBLEM_TREE_UNMAPPED_RUNTIME_ONLY_OP_COST_ESTIMATE_KEY_H + +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" + +namespace FlexFlow { + +UnmappedRuntimeOnlyOpCostEstimateKey + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const ¶llel_layer_guid); + +RuntimeOnlyOpCostEstimateKey map_unmapped_runtime_only_op_cost_estimate_key( + UnmappedRuntimeOnlyOpCostEstimateKey const &unmapped, + MachineView const &machine_view); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml new file mode 100644 index 0000000000..e38ce06f03 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "UnmappedRuntimeOnlyOpCostEstimateKey" +features = [ + "eq", + "fmt", + "hash", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "op-attrs/parallel_tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h index b21fea5f24..8924b1c110 100644 --- a/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h +++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping_result.h @@ -3,6 +3,7 @@ #include "compiler/machine_mapping/machine_mapping_result.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" +#include "utils/units/milliseconds_t.h" namespace FlexFlow { @@ -14,7 +15,7 @@ FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); std::unordered_set const &); [[nodiscard]] MachineMappingResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingResult const &pre_result, MachineMappingResult const &post_result, std::optional const @@ -28,7 +29,7 @@ 
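Putting the two declarations of unmapped_runtime_only_op_cost_estimate_key.h together: an unmapped key is derived from a PCG layer with no optimizer state at all, and binding a MachineView is what turns it into a mapped key. A hedged sketch; pcg, layer, and view stand in for real values.

// Sketch: from a PCG layer to a mapped, runtime-only cost-estimate key,
// using only the declarations added in this header.
#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h"

namespace FlexFlow {

RuntimeOnlyOpCostEstimateKey
    key_for_layer_on_view(ParallelComputationGraph const &pcg,
                          parallel_layer_guid_t const &layer,
                          MachineView const &view) {
  // Attrs and shapes only; no OptimizerAttrs needed on this path.
  UnmappedRuntimeOnlyOpCostEstimateKey unmapped =
      get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer);

  // Binding a MachineView is the "mapping" step.
  return map_unmapped_runtime_only_op_cost_estimate_key(unmapped, view);
}

} // namespace FlexFlow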
FeasibleMachineMappingResult require_feasible(MachineMappingResult const &); MachineMappingResult const &m2); [[nodiscard]] MachineMappingResult - make_singleton_machine_mapping_result(float runtime, + make_singleton_machine_mapping_result(milliseconds_t runtime, MachineView const &machine_view); } // namespace FlexFlow diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h index d176d298db..74c6aee851 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h @@ -3,11 +3,12 @@ #include "compiler/machine_mapping/machine_mapping_cache.dtg.h" #include "compiler/machine_mapping/machine_mapping_constraints.dtg.h" -#include "compiler/machine_mapping/machine_mapping_context.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_parallel_split.dtg.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/mm_problem_tree_series_split.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h" #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.dtg.h" +#include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.dtg.h" #include "compiler/machine_mapping/parallel_split_transformation.dtg.h" #include "pcg/machine_specification.dtg.h" @@ -15,14 +16,14 @@ namespace FlexFlow { MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, MachineMappingConstraints const &constraints); MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, @@ -31,15 +32,15 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeParallelSplit const ¶llel_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints); MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &, - UnmappedOpCostEstimateKey const &leaf, + MachineMappingWithMemoryContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, MachineSpecification const &resources, MachineMappingConstraints const &constraints); diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml new file mode 100644 index 
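series_combine now takes its communication cost as milliseconds_t as well. Assuming the natural reading of a series split, namely that the two halves execute one after the other and pay the tensor movement in between, the runtimes compose as in this sketch; the library's actual implementation is not shown in this patch.

// Sketch of the assumed cost composition behind series_combine; this is an
// inference from the signature, not the implementation from the patch.
#include "utils/units/milliseconds_t.h"

namespace FlexFlow {

milliseconds_t combined_series_runtime(milliseconds_t pre_runtime,
                                       milliseconds_t comm_cost,
                                       milliseconds_t post_runtime) {
  // Pre half, then communication, then post half, end to end.
  return pre_runtime + comm_cost + post_runtime;
}

} // namespace FlexFlow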
0000000000..9530697632 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_context.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryContext" +features = [] + +includes = [ + "compiler/cost_estimator/cost_estimator.h", + "pcg/machine_view.dtg.h", + "pcg/machine_specification.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h", + "pcg/optimizer_attrs.dtg.h", +] + +[[fields]] +name = "cost_estimator" +type = "::FlexFlow::CostEstimator" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" + +[[fields]] +name = "allowed_machine_views" +type = "std::function<std::unordered_set<::FlexFlow::MachineView>(::FlexFlow::UnmappedRuntimeOnlyOpCostEstimateKey const &, ::FlexFlow::MachineSpecification const &)>" diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h index 0383376116..4cb865dece 100644 --- a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h @@ -19,7 +19,7 @@ namespace FlexFlow { MachineMappingWithMemoryResult const &); [[nodiscard]] MachineMappingWithMemoryResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingWithMemoryResult const &pre_result, MachineMappingWithMemoryResult const &post_result, std::optional<ParallelSplitTransformation> const diff --git a/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml new file mode 100644 index 0000000000..77af129094 --- /dev/null +++ b/lib/compiler/include/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_state.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "MachineMappingWithMemoryState" +features = [ + "eq", + "hash", + "fmt", +] + +includes = [ + "pcg/machine_specification.dtg.h", + "compiler/machine_mapping/machine_mapping_constraints.dtg.h", + "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.dtg.h", + "pcg/optimizer_attrs.dtg.h", +] + +[[fields]] +name = "problem_tree" +type = "::FlexFlow::MachineMappingProblemTree" + +[[fields]] +name = "resources" +type = "::FlexFlow::MachineSpecification" + +[[fields]] +name = "constraints" +type = "::FlexFlow::MachineMappingConstraints" + +[[fields]] +name = "optimizer_attrs" +type = "::FlexFlow::OptimizerAttrs" diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml index 13f2f17652..cb8490c861 100644 --- a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml @@ -7,12 +7,12 @@ features = [ ] includes = [ - "compiler/cost_estimator/op_cost_estimate_key.dtg.h", + "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h", "compiler/cost_estimator/tensor_set_movement.dtg.h", ] [[values]] -type = "::FlexFlow::OpCostEstimateKey" +type = "::FlexFlow::RuntimeOnlyOpCostEstimateKey" key = "operator" [[values]]
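Aside: the headers below switch comm and runtime costs from bare float to the strong unit type milliseconds_t from utils/units. A sketch of the idea only; the real definition lives in utils/units/milliseconds_t.h and is not reproduced by this patch:

    // Values carry their unit in the type, so a bare float cannot be
    // accidentally mixed into a runtime; unwrapping is explicit.
    struct milliseconds_t {
      float value;
      float unwrap_milliseconds() const { return this->value; }
      bool operator==(milliseconds_t other) const { return this->value == other.value; }
      bool operator<(milliseconds_t other) const { return this->value < other.value; }
    };
    milliseconds_t operator+(milliseconds_t lhs, milliseconds_t rhs) {
      return milliseconds_t{lhs.value + rhs.value};
    }
    milliseconds_t operator""_ms(long double v) { return milliseconds_t{static_cast<float>(v)}; }
    milliseconds_t operator""_ms(unsigned long long v) { return milliseconds_t{static_cast<float>(v)}; }

This is what lets literals like 2.0_ms and sums such as pre + comm + post in the later hunks type-check without unit confusion.

diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h 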
b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h index b35733e419..9dadfdb155 100644 --- a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h +++ b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h @@ -1,15 +1,16 @@ #ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H #define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H -#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "pcg/machine_specification.dtg.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" namespace FlexFlow { -float task_simulator_estimate_forward_pass_time( + +milliseconds_t task_simulator_estimate_forward_pass_time( ParallelComputationGraph const &pcg, - CostEstimator const &estimator, + RuntimeOnlyCostEstimator const &estimator, MachineMapping const &machine_mapping, MachineSpecification const &machine_spec); diff --git a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc index 6ac6e3a8d6..37e7cc97fd 100644 --- a/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc +++ b/lib/compiler/src/compiler/cost_estimator/cost_estimator.cc @@ -9,7 +9,7 @@ OpCostMetrics CostEstimator::estimate_cost(OpCostEstimateKey const &k) const { return this->implementation_ptr->estimate_cost(k); } -float CostEstimator::estimate_cost(TensorSetMovement const &m) const { +milliseconds_t CostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->implementation_ptr->estimate_cost(m); } diff --git a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc index ef5775851f..92b07bbe23 100644 --- a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc @@ -14,10 +14,39 @@ namespace FlexFlow { OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( ParallelComputationGraph const &pcg, - parallel_layer_guid_t const &layer, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const &parallel_layer_guid, MachineView const &machine_view) { return map_unmapped_op_cost_estimate_key( - get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), machine_view); + get_unmapped_op_cost_estimate_key_for_layer( + pcg, optimizer_attrs, parallel_layer_guid), + machine_view); +} + +RuntimeOnlyOpCostEstimateKey runtime_only_from_op_cost_estimate_key( OpCostEstimateKey const &op_cost_estimate_key) { + + return RuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/op_cost_estimate_key.op_attrs, + /*input_shapes=*/op_cost_estimate_key.input_shapes, + /*weight_shapes=*/op_cost_estimate_key.weight_shapes, + /*output_shapes=*/op_cost_estimate_key.output_shapes, + /*machine_view=*/op_cost_estimate_key.machine_view, + }; +} + +OpCostEstimateKey make_op_cost_estimate_key_from_runtime_only( RuntimeOnlyOpCostEstimateKey const &runtime_only, + OptimizerAttrs const &optimizer_attrs) { + + return OpCostEstimateKey{ + /*op_attrs=*/runtime_only.op_attrs, + /*input_shapes=*/runtime_only.input_shapes, + /*weight_shapes=*/runtime_only.weight_shapes, + /*output_shapes=*/runtime_only.output_shapes, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/runtime_only.machine_view, + }; } } // namespace FlexFlow
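Aside: the two conversion helpers above are intended to be inverses up to the optimizer attrs. A hypothetical property check, not an actual test in this series (assumes the dtg-generated operator== for OpCostEstimateKey):

    #include <cassert>

    // Dropping the optimizer attrs and re-attaching the same attrs
    // should reproduce the original key exactly.
    void check_key_round_trip(OpCostEstimateKey const &key) {
      RuntimeOnlyOpCostEstimateKey stripped =
          runtime_only_from_op_cost_estimate_key(key);
      OpCostEstimateKey restored = make_op_cost_estimate_key_from_runtime_only(
          stripped, key.optimizer_attrs);
      assert(restored == key);
    }

diff --git 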
a/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc new file mode 100644 index 0000000000..2bca184419 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_metrics.cc @@ -0,0 +1,16 @@ +#include "compiler/cost_estimator/op_cost_metrics.h" + +namespace FlexFlow { + +OpCostMetrics make_op_cost_metrics_from_runtime_only( + RuntimeOnlyOpCostMetrics const &runtime_only, + num_bytes_t const &memory_usage) { + + return OpCostMetrics{ + /*forward_runtime=*/runtime_only.forward_runtime, + /*backward_runtime=*/runtime_only.backward_runtime, + /*memory_usage=*/memory_usage, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc new file mode 100644 index 0000000000..4dcb4c33fe --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator.cc @@ -0,0 +1,19 @@ +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator::RuntimeOnlyCostEstimator( + std::shared_ptr<IRuntimeOnlyCostEstimator> implementation_ptr) + : implementation_ptr(implementation_ptr) {} + +RuntimeOnlyOpCostMetrics RuntimeOnlyCostEstimator::estimate_cost( + RuntimeOnlyOpCostEstimateKey const &k) const { + return this->implementation_ptr->estimate_cost(k); +} + +milliseconds_t + RuntimeOnlyCostEstimator::estimate_cost(TensorSetMovement const &m) const { + return this->implementation_ptr->estimate_cost(m); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc new file mode 100644 index 0000000000..74099e115c --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.cc @@ -0,0 +1,45 @@ +#include "compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimatorFromCostEstimator:: + RuntimeOnlyCostEstimatorFromCostEstimator( + CostEstimator const &cost_estimator) + : cost_estimator(cost_estimator) {} + +RuntimeOnlyOpCostMetrics + RuntimeOnlyCostEstimatorFromCostEstimator::estimate_cost( + RuntimeOnlyOpCostEstimateKey const &runtime_only) const { + OptimizerAttrs fake_optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0, + }, + }; + + OpCostEstimateKey op_cost_estimate_key = + make_op_cost_estimate_key_from_runtime_only(runtime_only, + fake_optimizer_attrs); + + OpCostMetrics op_cost_metrics = + this->cost_estimator.estimate_cost(op_cost_estimate_key); + + return runtime_only_from_op_cost_metrics(op_cost_metrics); +} + +milliseconds_t RuntimeOnlyCostEstimatorFromCostEstimator::estimate_cost( + TensorSetMovement const &tensor_set_movement) const { + return this->cost_estimator.estimate_cost(tensor_set_movement); +} + +RuntimeOnlyCostEstimator runtime_only_cost_estimator_from_cost_estimator( + CostEstimator const &cost_estimator) { + return RuntimeOnlyCostEstimator::create< RuntimeOnlyCostEstimatorFromCostEstimator>(cost_estimator); +} + +} // namespace FlexFlow
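Aside: the adapter above fills in placeholder SGDOptimizerAttrs because forward and backward runtimes do not depend on optimizer state; only the memory and update costs, which runtime_only_from_op_cost_metrics drops, would. A hypothetical call site, not part of this patch:

    // Wrap a full CostEstimator so code that only needs runtimes can use it.
    RuntimeOnlyOpCostMetrics estimate_runtimes_only(
        CostEstimator const &full_estimator,
        RuntimeOnlyOpCostEstimateKey const &key) {
      RuntimeOnlyCostEstimator runtime_only =
          runtime_only_cost_estimator_from_cost_estimator(full_estimator);
      return runtime_only.estimate_cost(key);
    }

diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc 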
b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc new file mode 100644 index 0000000000..0c097b165e --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_estimate_key.cc @@ -0,0 +1,17 @@ +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostEstimateKey + get_mapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &parallel_layer_guid, + MachineView const &machine_view) { + return map_unmapped_runtime_only_op_cost_estimate_key( + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + pcg, parallel_layer_guid), + machine_view); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc new file mode 100644 index 0000000000..4cfd864de5 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/runtime_only_op_cost_metrics.cc @@ -0,0 +1,14 @@ +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.h" + +namespace FlexFlow { + +RuntimeOnlyOpCostMetrics + runtime_only_from_op_cost_metrics(OpCostMetrics const &op_cost_metrics) { + + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/op_cost_metrics.forward_runtime, + /*backward_runtime=*/op_cost_metrics.backward_runtime, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 49d528e4ab..8ca033d0d6 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -6,6 +6,7 @@ #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "compiler/machine_mapping/machine_mapping_result.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.dtg.h" @@ -86,11 +87,11 @@ MachineMappingResult allowed = generate_map( boundary_layers, [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> { - UnmappedOpCostEstimateKey leaf = + UnmappedRuntimeOnlyOpCostEstimateKey leaf = mm_problem_tree_get_subtree_at_path( MachineMappingProblemTree{series_split}, l) .value() - .get<UnmappedOpCostEstimateKey>(); + .get<UnmappedRuntimeOnlyOpCostEstimateKey>(); return context.allowed_machine_views(leaf, resources); }); return transform( @@ -156,7 +157,7 @@ MachineMappingResult tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - float cost_across_split = + milliseconds_t cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, @@ -222,12 +223,12 @@ MachineMappingResult get_optimal_machine_mapping( get_mapping_with_minimal_runtime(parallel_results)); } -MachineMappingResult - get_optimal_machine_mapping(MachineMappingCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, - MachineSpecification const &resource, - 
MachineMappingConstraints const &constraints) { +MachineMappingResult get_optimal_machine_mapping( + MachineMappingCache &result_cache, + MachineMappingContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, + MachineSpecification const &resource, + MachineMappingConstraints const &constraints) { std::unordered_set<MachineView> candidates = [&] { std::optional<MachineView> machine_view = require_only_root(constraints); @@ -239,10 +240,11 @@ MachineMappingResult }(); auto get_mapping_result = [&](MachineView const &machine_view) { - OpCostEstimateKey mapped = - map_unmapped_op_cost_estimate_key(leaf, machine_view); - OpCostMetrics metrics = context.cost_estimator.estimate_cost(mapped); - float cost = metrics.forward_runtime + metrics.backward_runtime; + RuntimeOnlyOpCostEstimateKey mapped = + map_unmapped_runtime_only_op_cost_estimate_key(leaf, machine_view); + RuntimeOnlyOpCostMetrics metrics = + context.cost_estimator.estimate_cost(mapped); + milliseconds_t cost = metrics.forward_runtime + metrics.backward_runtime; return make_singleton_machine_mapping_result(cost, machine_view); }; diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 367af3701e..da6b7b91e5 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,7 +1,7 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" -#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" #include "compiler/series_parallel/pcg/pcg_binary_sp_decomposition.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" @@ -41,7 +41,8 @@ MachineMappingProblemTree get_machine_mapping_problem_tree( }, [&](parallel_layer_guid_t const &leaf) { return MachineMappingProblemTree{ - get_unmapped_op_cost_estimate_key_for_layer(pcg, leaf), + get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, + leaf), }; }, }); diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc index 1e39a7be19..09323b1800 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.cc @@ -5,16 +5,17 @@ namespace FlexFlow { -GenericBinarySPDecompositionTreeImplementation +GenericBinarySPDecompositionTreeImplementation< + MachineMappingProblemTree, + MMProblemTreeSeriesSplit, + MMProblemTreeParallelSplit, + UnmappedRuntimeOnlyOpCostEstimateKey> generic_binary_sp_impl_for_mm_problem_tree() { return GenericBinarySPDecompositionTreeImplementation< MachineMappingProblemTree, MMProblemTreeSeriesSplit, MMProblemTreeParallelSplit, - 
UnmappedOpCostEstimateKey>{ + UnmappedRuntimeOnlyOpCostEstimateKey>{ /*series_get_left_child=*/[](MMProblemTreeSeriesSplit const &split) -> MachineMappingProblemTree const & { return split.get_left_child(); @@ -50,8 +51,8 @@ GenericBinarySPDecompositionTreeImplementation - -> UnmappedOpCostEstimateKey const & { - return tree.get<UnmappedOpCostEstimateKey>(); + -> UnmappedRuntimeOnlyOpCostEstimateKey const & { + return tree.get<UnmappedRuntimeOnlyOpCostEstimateKey>(); }, }; } @@ -65,13 +66,13 @@ SPDecompositionTreeNodeType [](MMProblemTreeParallelSplit const &) { return SPDecompositionTreeNodeType::PARALLEL; }, - [](UnmappedOpCostEstimateKey const &) { + [](UnmappedRuntimeOnlyOpCostEstimateKey const &) { return SPDecompositionTreeNodeType::NODE; }, }); } -std::unordered_multiset<UnmappedOpCostEstimateKey> +std::unordered_multiset<UnmappedRuntimeOnlyOpCostEstimateKey> get_leaves(MachineMappingProblemTree const &tree) { return get_leaves(tree, generic_binary_sp_impl_for_mm_problem_tree()); } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc index 990b287f8b..7659467b6e 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.cc @@ -1,23 +1,39 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" namespace FlexFlow { UnmappedOpCostEstimateKey get_unmapped_op_cost_estimate_key_for_layer( - ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer) { - auto get_tensor_shape = [&](parallel_tensor_guid_t const &t) { - return get_parallel_tensor_shape(pcg, t); - }; + ParallelComputationGraph const &pcg, + OptimizerAttrs const &optimizer_attrs, + parallel_layer_guid_t const &layer) { + return unmapped_op_cost_estimate_key_from_runtime_only( + get_unmapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer), + optimizer_attrs); +} +UnmappedOpCostEstimateKey unmapped_op_cost_estimate_key_from_runtime_only( UnmappedRuntimeOnlyOpCostEstimateKey const &runtime_only, OptimizerAttrs const &optimizer_attrs) { return UnmappedOpCostEstimateKey{ - /*op_attrs=*/pcg_get_op_attrs(pcg, layer), - /*input_shapes=*/ - transform(get_incoming_inputs(pcg, layer), get_tensor_shape), - /*weight_shapes=*/ - transform(get_incoming_weights(pcg, layer), get_tensor_shape), - /*output_shapes=*/ - transform(get_layer_outputs(pcg, layer), get_tensor_shape), + /*op_attrs=*/runtime_only.op_attrs, + /*input_shapes=*/runtime_only.input_shapes, + /*weight_shapes=*/runtime_only.weight_shapes, + /*output_shapes=*/runtime_only.output_shapes, + /*optimizer_attrs=*/optimizer_attrs, + }; +} + +UnmappedRuntimeOnlyOpCostEstimateKey + runtime_only_from_unmapped_op_cost_estimate_key( UnmappedOpCostEstimateKey const &unmapped_op_cost_estimate_key) { return UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/unmapped_op_cost_estimate_key.op_attrs, + /*input_shapes=*/unmapped_op_cost_estimate_key.input_shapes, + /*weight_shapes=*/unmapped_op_cost_estimate_key.weight_shapes, + /*output_shapes=*/unmapped_op_cost_estimate_key.output_shapes, }; } @@ -29,6 +45,7 @@ OpCostEstimateKey /*input_shapes=*/unmapped.input_shapes, 
/*weight_shapes=*/unmapped.weight_shapes, /*output_shapes=*/unmapped.output_shapes, + /*optimizer_attrs=*/unmapped.optimizer_attrs, /*machine_view=*/machine_view, }; } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc new file mode 100644 index 0000000000..53155a9a9b --- /dev/null +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.cc @@ -0,0 +1,39 @@ +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" + +namespace FlexFlow { + +UnmappedRuntimeOnlyOpCostEstimateKey + get_unmapped_runtime_only_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &parallel_layer_guid) { + auto get_tensor_shape = [&](parallel_tensor_guid_t const &t) { + return get_parallel_tensor_shape(pcg, t); + }; + + return UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/pcg_get_op_attrs(pcg, parallel_layer_guid), + /*input_shapes=*/ + transform(get_incoming_inputs(pcg, parallel_layer_guid), + get_tensor_shape), + /*weight_shapes=*/ + transform(get_incoming_weights(pcg, parallel_layer_guid), + get_tensor_shape), + /*output_shapes=*/ + transform(get_layer_outputs(pcg, parallel_layer_guid), get_tensor_shape), + }; +} + +RuntimeOnlyOpCostEstimateKey map_unmapped_runtime_only_op_cost_estimate_key( + UnmappedRuntimeOnlyOpCostEstimateKey const &unmapped, + MachineView const &machine_view) { + return RuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/unmapped.op_attrs, + /*input_shapes=*/unmapped.input_shapes, + /*weight_shapes=*/unmapped.weight_shapes, + /*output_shapes=*/unmapped.output_shapes, + /*machine_view=*/machine_view, + }; +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc index 3409f7f871..a370a6803d 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping_result.cc @@ -32,7 +32,7 @@ FeasibleMachineMappingResult } MachineMappingResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingResult const &maybe_pre_result, MachineMappingResult const &maybe_post_result, std::optional<ParallelSplitTransformation> const @@ -122,7 +122,7 @@ MachineMappingResult minimize_runtime(MachineMappingResult const &maybe_m1, } MachineMappingResult - make_singleton_machine_mapping_result(float runtime, + make_singleton_machine_mapping_result(milliseconds_t runtime, MachineView const &machine_view) { return MachineMappingResult{ FeasibleMachineMappingResult{ diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index b67083e8cd..74e8db6304 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -26,7 +26,7 @@ namespace FlexFlow { MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache 
&result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MachineMappingProblemTree const &problem_tree, MachineSpecification const &resources, MachineMappingConstraints const &constraints) { @@ -71,7 +71,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeSeriesSplit const &series_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints, @@ -85,11 +85,11 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( allowed = generate_map( boundary_layers, [&](BinaryTreePath const &l) -> std::unordered_set<MachineView> { - UnmappedOpCostEstimateKey leaf = + UnmappedRuntimeOnlyOpCostEstimateKey leaf = mm_problem_tree_get_subtree_at_path( MachineMappingProblemTree{series_split}, l) .value() - .get<UnmappedOpCostEstimateKey>(); + .get<UnmappedRuntimeOnlyOpCostEstimateKey>(); return context.allowed_machine_views(leaf, resources); }); return transform( @@ -158,7 +158,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( tensor_movement, /*pre_mapping=*/assigned_pre_machine_views, /*post_mapping=*/assigned_post_machine_views); - float cost_across_split = + milliseconds_t cost_across_split = context.cost_estimator.estimate_cost(comm_across_split); result = minimize_runtime(result, @@ -174,7 +174,7 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, + MachineMappingWithMemoryContext const &context, MMProblemTreeParallelSplit const &parallel_split, MachineSpecification const &resources, MachineMappingConstraints const &constraints) { @@ -232,8 +232,8 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( MachineMappingWithMemoryCache &result_cache, - MachineMappingContext const &context, - UnmappedOpCostEstimateKey const &leaf, + MachineMappingWithMemoryContext const &context, + UnmappedRuntimeOnlyOpCostEstimateKey const &leaf, MachineSpecification const &resource, MachineMappingConstraints const &constraints) { @@ -247,8 +247,10 @@ MachineMappingWithMemoryResult get_optimal_machine_mapping_with_memory( }(); auto get_mapping_result = [&](MachineView const &machine_view) { - OpCostEstimateKey mapped = - map_unmapped_op_cost_estimate_key(leaf, machine_view); + OpCostEstimateKey mapped = map_unmapped_op_cost_estimate_key( + unmapped_op_cost_estimate_key_from_runtime_only( + leaf, context.optimizer_attrs), + machine_view); OpCostMetrics cost = context.cost_estimator.estimate_cost(mapped); return make_singleton_machine_mapping_with_memory_result(cost,
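Aside: the next hunk renames memory to memory_usage inside the Pareto filter. For reference, the dominance test it applies is the usual weak-domination check over the three cost axes; a sketch only (assumes OpCostMetrics's comparison operators from its dtg features, and note the real filter compares the full mapping entries, not just their costs, when requiring the candidates to differ):

    // a dominates b if a is no worse on every axis and the two are not equal.
    bool dominates(OpCostMetrics const &a, OpCostMetrics const &b) {
      return a.forward_runtime <= b.forward_runtime &&
             a.backward_runtime <= b.backward_runtime &&
             a.memory_usage <= b.memory_usage &&
             a != b;
    }
    // A candidate mapping is kept iff no other candidate dominates it.

diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc index 9b4a1fd6fe..cff7984897 100644 --- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc +++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc @@ -33,7 +33,7 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( if (mapping.cost.forward_runtime >= 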
other_mapping.cost.forward_runtime && mapping.cost.backward_runtime >= other_mapping.cost.backward_runtime && - mapping.cost.memory >= other_mapping.cost.memory && + mapping.cost.memory_usage >= other_mapping.cost.memory_usage && mapping != other_mapping) { is_pareto_optimal = false; break; @@ -47,7 +47,7 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result( } MachineMappingWithMemoryResult - series_combine(float comm_cost, + series_combine(milliseconds_t comm_cost, MachineMappingWithMemoryResult const &pre_result, MachineMappingWithMemoryResult const &post_result, std::optional const @@ -56,11 +56,12 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &pre_mm, MachineMappingForSingleLayer const &post_mm) { OpCostMetrics cost = OpCostMetrics{ - pre_mm.cost.forward_runtime + comm_cost + + /*forward_runtime=*/pre_mm.cost.forward_runtime + comm_cost + post_mm.cost.forward_runtime, - pre_mm.cost.backward_runtime + comm_cost + + /*backward_runtime=*/pre_mm.cost.backward_runtime + comm_cost + post_mm.cost.backward_runtime, - pre_mm.cost.memory + post_mm.cost.memory, + /*memory_usage=*/pre_mm.cost.memory_usage + + post_mm.cost.memory_usage, }; ParallelLayerGuidObliviousMachineMapping mapping = [&] { @@ -98,10 +99,13 @@ MachineMappingWithMemoryResult [&](MachineMappingForSingleLayer const &lhs_mm, MachineMappingForSingleLayer const &rhs_mm) { OpCostMetrics cost = OpCostMetrics{ + /*forward_runtime=*/ std::max(lhs_mm.cost.forward_runtime, rhs_mm.cost.forward_runtime), + /*backward_runtime=*/ std::max(lhs_mm.cost.backward_runtime, - rhs_mm.cost.backward_runtime), //(@wmdi) is this correct? - std::max(lhs_mm.cost.memory, rhs_mm.cost.memory), + rhs_mm.cost.backward_runtime), + /*memory_usage=*/ + std::max(lhs_mm.cost.memory_usage, rhs_mm.cost.memory_usage), }; ParallelLayerGuidObliviousMachineMapping mapping = diff --git a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc index 539c44a963..c072b0e61e 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc @@ -1,5 +1,5 @@ #include "compiler/task_graph_simulator/pcg_task_graph.h" -#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.h" #include "compiler/cost_estimator/tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "pcg/device_id_t.dtg.h" @@ -28,8 +28,8 @@ PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg, for (parallel_layer_guid_t const &layer : get_parallel_layers(pcg)) { MachineView mv = machine_mapping.machine_views.at(layer); - OpCostEstimateKey op_key = - get_mapped_op_cost_estimate_key_for_layer(pcg, layer, mv); + RuntimeOnlyOpCostEstimateKey op_key = + get_mapped_runtime_only_op_cost_estimate_key_for_layer(pcg, layer, mv); Node node = digraph.add_node(); node_to_task.equate(node, PCGTask{op_key}); node_to_layer.equate(node, layer); diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc index 716a7afe15..1e15931174 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc @@ -8,10 +8,8 @@ namespace FlexFlow { float get_total_execution_time(TaskGraphExecutionTrace 
const &trace) { - if (trace.task_profiles.empty()) { - throw mk_runtime_error( - fmt::format("TaskGraphExecutionTrace {} is empty", trace)); - } + ASSERT(!trace.task_profiles.empty()); + float end_time = maximum(transform(trace.task_profiles, [](TaskProfile const &profile) { return profile.end_time; diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc index ab204e7d71..a1aa53885b 100644 --- a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc @@ -19,9 +19,9 @@ namespace FlexFlow { -float task_simulator_estimate_forward_pass_time( +milliseconds_t task_simulator_estimate_forward_pass_time( ParallelComputationGraph const &pcg, - CostEstimator const &estimator, + RuntimeOnlyCostEstimator const &estimator, MachineMapping const &machine_mapping, MachineSpecification const &machine_spec) { @@ -30,11 +30,16 @@ float task_simulator_estimate_forward_pass_time( auto cost_function = [&](Node const &node) -> float { PCGTask task = task_graph.node_to_task.at_l(node); - if (task.is_operator()) { - return estimator.estimate_cost(task.require_operator()).forward_runtime; - } else { - return estimator.estimate_cost(task.require_tensor_movement()); - } + + milliseconds_t running_time = [&] { + if (task.is_operator()) { + return estimator.estimate_cost(task.require_operator()).forward_runtime; + } else { + return estimator.estimate_cost(task.require_tensor_movement()); + } + }(); + + return running_time.unwrap_milliseconds(); }; auto is_allowed_to_run = @@ -64,8 +69,8 @@ float task_simulator_estimate_forward_pass_time( TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run}; - return get_total_execution_time(simulate_task_graph_execution( - task_graph.graph, cost_function, constraint)); + return milliseconds_t{get_total_execution_time(simulate_task_graph_execution( + task_graph.graph, cost_function, constraint))}; } } // namespace FlexFlow diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index c3342c1b3a..2cbc87cffe 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -1,10 +1,13 @@ #include "compiler/machine_mapping/get_optimal_machine_mapping.h" -#include "../cost_estimator_for_test.h" +#include "compiler/cost_estimator/runtime_only_op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/runtime_only_op_cost_metrics.dtg.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_cache.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.h" +#include "internal/runtime_only_cost_estimator_for_test.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -17,7 +20,7 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { 
TEST_CASE("get_optimal_machine_mapping") { - auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { + auto make_leaf = [](UnmappedRuntimeOnlyOpCostEstimateKey const &k) { return MachineMappingProblemTree{k}; }; @@ -90,14 +93,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, - MachineSpecification const &resources) { - if (resources == full_machine_spec) { - return std::unordered_set{mv1, mv2}; - } else { - return std::unordered_set{mv2}; - } - }; + auto allowed_machine_views1 = + [&](UnmappedRuntimeOnlyOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set{mv1, mv2}; + } else { + return std::unordered_set{mv2}; + } + }; TensorShape tensor_shape = TensorShape{ TensorDims{ @@ -109,24 +113,26 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ - /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, - /*input_shapes=*/{}, - /*weight_shapes=*/{}, - /*output_shapes=*/{}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey k1 = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; - UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ - /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ - /*type=*/OperatorType::EW_ADD, - /*compute_type=*/DataType::FLOAT, - /*should_broadcast_lhs=*/false, - /*should_broadcast_rhs=*/false, - }}, - /*input_shapes=*/{}, - /*weight_shapes=*/{}, - /*output_shapes=*/{}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey k2 = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{ElementBinaryAttrs{ + /*type=*/OperatorType::EW_ADD, + /*compute_type=*/DataType::FLOAT, + /*should_broadcast_lhs=*/false, + /*should_broadcast_rhs=*/false, + }}, + /*input_shapes=*/{}, + /*weight_shapes=*/{}, + /*output_shapes=*/{}, + }; ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); @@ -147,41 +153,39 @@ TEST_SUITE(FF_TEST_SUITE) { {binary_tree_root_path(), mv2}, }}; - auto map1 = std::unordered_map{{ - {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{/*forward_runtime=*/0.5, - /*backward_runtime=*/0.5, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{/*forward_runtime=*/0.75, - /*backward_runtime=*/0.75, - /*memory=*/nonnegative_int{0}}}, - {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{/*forward_runtime=*/1.25, - /*backward_runtime=*/1.25, - /*memory=*/nonnegative_int{0}}}, + auto map1 = std::unordered_map{{ + {map_unmapped_runtime_only_op_cost_estimate_key(k1, mv1), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/0.5_ms, + /*backward_runtime=*/0.5_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k2, mv1), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/1.0_ms, + /*backward_runtime=*/1.0_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k1, mv2), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/0.75_ms, + /*backward_runtime=*/0.75_ms}}, + {map_unmapped_runtime_only_op_cost_estimate_key(k2, mv2), + RuntimeOnlyOpCostMetrics{/*forward_runtime=*/1.25_ms, + /*backward_runtime=*/1.25_ms}}, }}; - CostEstimator cost_estimator = make_fake_cost_estimator( - map1, - std::unordered_map{{ - 
{TensorSetMovement{{}}, 0.0}, - {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - 0.1}, - {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - 0.2}, - {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - 0.3}, - {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - 0.4}, - }}); + RuntimeOnlyCostEstimator runtime_only_cost_estimator = + make_fake_runtime_only_cost_estimator( + map1, + std::unordered_map{{ + {TensorSetMovement{{}}, 0.0_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), + 0.1_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), + 0.2_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), + 0.3_ms}, + {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), + 0.4_ms}, + }}); MachineMappingContext context = MachineMappingContext{ - cost_estimator, + runtime_only_cost_estimator, allowed_machine_views1, }; @@ -198,7 +202,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0, + /*runtime=*/1.0_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, @@ -221,7 +225,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/1.0 + 2.0 + 0.1, + /*runtime=*/1.0_ms + 2.0_ms + 0.1_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -255,7 +259,7 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.5, + /*runtime=*/2.5_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index c5b68e3a76..586a2b7764 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -1,6 +1,6 @@ #include "compiler/machine_mapping/get_tensor_set_movement_across_split.h" -#include "../cost_estimator_for_test.h" #include "compiler/machine_mapping/transitive_reduced_pcg.h" +#include "internal/cost_estimator_for_test.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index d2c829df30..2fcffac29a 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -1,5 +1,6 @@ #include "compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" +#include 
"compiler/machine_mapping/machine_mapping_problem_tree/unmapped_runtime_only_op_cost_estimate_key.dtg.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/get_only.h" @@ -33,9 +34,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; }; - auto mm_problem_tree_make_leaf = [](UnmappedOpCostEstimateKey const &k) { - return MachineMappingProblemTree{k}; - }; + auto mm_problem_tree_make_leaf = + [](UnmappedRuntimeOnlyOpCostEstimateKey const &k) { + return MachineMappingProblemTree{k}; + }; auto mm_problem_tree_make_series = [](AbstractedTensorSetMovement const &tensor_set_movement, @@ -92,7 +94,7 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input_key = [&](ParallelTensorShape const ¶llel_tensor_shape) { - return UnmappedOpCostEstimateKey{ + return UnmappedRuntimeOnlyOpCostEstimateKey{ /*op_attrs=*/input_attrs, /*input_shapes=*/{}, /*weight_shapes=*/{}, @@ -108,7 +110,8 @@ TEST_SUITE(FF_TEST_SUITE) { /*output_labels=*/{}); parallel_layer_guid_t input_layer = input_added.parallel_layer; - UnmappedOpCostEstimateKey input_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input_key = + make_input_key(par_input_shape); PCGBinarySPDecomposition sp_decomposition = PCGBinarySPDecomposition{input_layer}; @@ -129,7 +132,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t input_layer = input_added.parallel_layer; parallel_tensor_guid_t input = get_only(input_added.outputs); - UnmappedOpCostEstimateKey input_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input_key = + make_input_key(par_input_shape); PCGOperatorAttrs relu_attrs = PCGOperatorAttrs{ ElementUnaryAttrs{ @@ -143,12 +147,13 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t relu_layer = relu_added.parallel_layer; parallel_tensor_guid_t relu_output = get_only(relu_added.outputs); - UnmappedOpCostEstimateKey relu_key = UnmappedOpCostEstimateKey{ - /*op_attrs=*/relu_attrs, - /*input_shapes=*/{par_input_shape}, - /*weight_shapes=*/{}, - /*output_shapes=*/{relu_output_shape}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey relu_key = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/relu_attrs, + /*input_shapes=*/{par_input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{relu_output_shape}, + }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series( pcg_make_leaf(input_layer), pcg_make_leaf(relu_layer)); @@ -180,12 +185,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelLayerAddedResult input1_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input1_layer = input1_added.parallel_layer; - UnmappedOpCostEstimateKey input1_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input1_key = + make_input_key(par_input_shape); ParallelLayerAddedResult input2_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input2_layer = input2_added.parallel_layer; - UnmappedOpCostEstimateKey input2_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input2_key = + make_input_key(par_input_shape); PCGBinarySPDecomposition sp_decomposition = pcg_make_parallel( pcg_make_leaf(input1_layer), pcg_make_leaf(input2_layer)); @@ -205,13 +212,15 @@ TEST_SUITE(FF_TEST_SUITE) { pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input1_layer = input1_added.parallel_layer; parallel_tensor_guid_t input1_tensor = get_only(input1_added.outputs); - UnmappedOpCostEstimateKey input1_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey 
input1_key = + make_input_key(par_input_shape); ParallelLayerAddedResult input2_added = pcg_add_input_layer(pcg, input_shape); parallel_layer_guid_t input2_layer = input2_added.parallel_layer; parallel_tensor_guid_t input2_tensor = get_only(input2_added.outputs); - UnmappedOpCostEstimateKey input2_key = make_input_key(par_input_shape); + UnmappedRuntimeOnlyOpCostEstimateKey input2_key = + make_input_key(par_input_shape); PCGOperatorAttrs ew_op_attrs = PCGOperatorAttrs{ ElementBinaryAttrs{ @@ -228,12 +237,13 @@ TEST_SUITE(FF_TEST_SUITE) { {input1_tensor, input2_tensor}, {}); parallel_layer_guid_t ew_op_layer = ew_op_added.parallel_layer; - UnmappedOpCostEstimateKey ew_op_key = UnmappedOpCostEstimateKey{ - /*op_attrs=*/ew_op_attrs, - /*input_shapes=*/{par_input_shape, par_input_shape}, - /*weight_shapes=*/{}, - /*output_shapes=*/{ew_op_output_shape}, - }; + UnmappedRuntimeOnlyOpCostEstimateKey ew_op_key = + UnmappedRuntimeOnlyOpCostEstimateKey{ + /*op_attrs=*/ew_op_attrs, + /*input_shapes=*/{par_input_shape, par_input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{ew_op_output_shape}, + }; PCGBinarySPDecomposition sp_decomposition = pcg_make_series(pcg_make_parallel(pcg_make_leaf(input1_layer), diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index c7a757d91f..26f61253c3 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -36,7 +36,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - float pre_cost = 2.0; + milliseconds_t pre_cost = 2.0_ms; MachineMappingResult pre = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/pre_cost, @@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - float post_cost = 4.0; + milliseconds_t post_cost = 4.0_ms; MachineMappingResult post = MachineMappingResult{ FeasibleMachineMappingResult{ /*runtime=*/post_cost, @@ -74,7 +74,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult infeasible = infeasible_machine_mapping_result(); - float comm_cost = 3.0; + milliseconds_t comm_cost = 3.0_ms; SUBCASE("pre is infeasible") { MachineMappingResult result = series_combine( @@ -219,7 +219,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult lhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*runtime=*/2_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -240,7 +240,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult rhs = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -278,7 +278,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult result = parallel_combine(lhs, rhs); MachineMappingResult correct = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -342,7 +342,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult faster = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/2.0, + /*runtime=*/2_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { @@ -363,7 +363,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingResult slower = MachineMappingResult{ FeasibleMachineMappingResult{ - /*runtime=*/4.0, + /*runtime=*/4_ms, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ { diff --git 
a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 22202c36f7..96b11e6d33 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -1,10 +1,10 @@ #include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h" -#include "../../cost_estimator_for_test.h" #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h" #include "compiler/machine_mapping/machine_mapping_constraints.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_cache.h" +#include "internal/cost_estimator_for_test.h" #include "op-attrs/parallel_tensor_shape.h" #include "pcg/machine_view.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" @@ -18,7 +18,8 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_optimal_machine_mapping_with_memory") { auto make_leaf = [](UnmappedOpCostEstimateKey const &k) { - return MachineMappingProblemTree{k}; + return MachineMappingProblemTree{ + runtime_only_from_unmapped_op_cost_estimate_key(k)}; }; auto make_series_split = @@ -90,14 +91,15 @@ TEST_SUITE(FF_TEST_SUITE) { /*intra_node_bandwidth=*/1, }; - auto allowed_machine_views1 = [&](UnmappedOpCostEstimateKey const &, - MachineSpecification const &resources) { - if (resources == full_machine_spec) { - return std::unordered_set{mv1, mv2}; - } else { - return std::unordered_set{mv2}; - } - }; + auto allowed_machine_views1 = + [&](UnmappedRuntimeOnlyOpCostEstimateKey const &, + MachineSpecification const &resources) { + if (resources == full_machine_spec) { + return std::unordered_set{mv1, mv2}; + } else { + return std::unordered_set{mv2}; + } + }; TensorShape tensor_shape = TensorShape{ TensorDims{ @@ -111,11 +113,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape par_tensor_shape = lift_to_parallel(tensor_shape); + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + UnmappedOpCostEstimateKey k1 = UnmappedOpCostEstimateKey{ /*op_attrs=*/PCGOperatorAttrs{InputAttrs{tensor_shape}}, /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*optimizer_attrs=*/optimizer_attrs, }; UnmappedOpCostEstimateKey k2 = UnmappedOpCostEstimateKey{ @@ -128,6 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*input_shapes=*/{}, /*weight_shapes=*/{}, /*output_shapes=*/{}, + /*optimizer_attrs=*/optimizer_attrs, }; AbstractedTensorSetMovement movement1 = AbstractedTensorSetMovement{{ @@ -150,36 +163,37 @@ TEST_SUITE(FF_TEST_SUITE) { CostEstimator cost_estimator = make_fake_cost_estimator( std::unordered_map{{ {map_unmapped_op_cost_estimate_key(k1, mv1), - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + /*memory_usage=*/2_bytes}}, {map_unmapped_op_cost_estimate_key(k2, mv1), - OpCostMetrics{/*forward_runtime=*/2.0, - 
/*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}}}, + OpCostMetrics{/*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes}}, {map_unmapped_op_cost_estimate_key(k1, mv2), - OpCostMetrics{/*forward_runtime=*/1.5, - /*backward_runtime=*/1.5, - /*memory=*/nonnegative_int{1}}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms, + /*backward_runtime=*/1.5_ms, + /*memory_usage=*/1_bytes}}, {map_unmapped_op_cost_estimate_key(k2, mv2), - OpCostMetrics{/*forward_runtime=*/2.5, - /*backward_runtime=*/2.5, - /*memory=*/nonnegative_int{2}}}, + OpCostMetrics{/*forward_runtime=*/2.5_ms, + /*backward_runtime=*/2.5_ms, + /*memory_usage=*/2_bytes}}, }}, - std::unordered_map{{ - {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0}, + std::unordered_map{{ + {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1), - /*cost=*/0.1}, + /*cost=*/0.1_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2), - /*cost=*/0.2}, + /*cost=*/0.2_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2), - /*cost=*/0.3}, + /*cost=*/0.3_ms}, {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1), - /*cost=*/0.4}, + /*cost=*/0.4_ms}, }}); - MachineMappingContext context = MachineMappingContext{ + MachineMappingWithMemoryContext context = MachineMappingWithMemoryContext{ cost_estimator, + optimizer_attrs, allowed_machine_views1, }; @@ -198,17 +212,17 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.0, - /*backward_runtime=*/1.0, - /*memory=*/nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + /*memory_usage=*/2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv1}, }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.5, - /*backward_runtime=*/1.5, - /*memory=*/nonnegative_int{1}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms, + /*backward_runtime=*/1.5_ms, + /*memory_usage=*/1_bytes}, ParallelLayerGuidObliviousMachineMapping{{ {binary_tree_root_path(), mv2}, }}, @@ -232,9 +246,9 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ OpCostMetrics{ - /*forward_runtime=*/1.0 + 2.0 + 0.1, - /*backward_runtime=*/1.0 + 2.0 + 0.1, - /*memory=*/nonnegative_int{2 + 3}, + /*forward_runtime=*/1.0_ms + 2.0_ms + 0.1_ms, + /*backward_runtime=*/1.0_ms + 2.0_ms + 0.1_ms, + /*memory_usage=*/2_bytes + 3_bytes, }, ParallelLayerGuidObliviousMachineMapping{{ { @@ -252,9 +266,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}, }, MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/1.5 + 2.5 + 0.1, - /*backward_runtime=*/1.5 + 2.5 + 0.1, - /*memory=*/nonnegative_int{1 + 2}}, + OpCostMetrics{/*forward_runtime=*/1.5_ms + 2.5_ms + 0.1_ms, + /*backward_runtime=*/1.5_ms + 2.5_ms + 0.1_ms, + /*memory_usage=*/1_bytes + 2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ @@ -288,9 +302,9 @@ TEST_SUITE(FF_TEST_SUITE) { cache, context, problem_tree, full_machine_spec, constraints); MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{ - OpCostMetrics{/*forward_runtime=*/2.5, - /*backward_runtime=*/2.5, - /*memory=*/nonnegative_int{2}}, + OpCostMetrics{/*forward_runtime=*/2.5_ms, + /*backward_runtime=*/2.5_ms, + 
/*memory_usage=*/2_bytes}, ParallelLayerGuidObliviousMachineMapping{{ { BinaryTreePath{{ diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 35b55d2273..2192b442cd 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -53,21 +53,21 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; OpCostMetrics cost2 = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; OpCostMetrics cost3 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/3_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ @@ -188,9 +188,9 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics pre_cost = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -215,9 +215,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics post_cost = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -237,7 +237,7 @@ TEST_SUITE(FF_TEST_SUITE) { MachineMappingWithMemoryResult empty = empty_machine_mapping_with_memory_result(); - float comm_cost = 3.0; + milliseconds_t comm_cost = 3_ms; SUBCASE("pre is empty") { MachineMappingWithMemoryResult result = series_combine( @@ -265,7 +265,8 @@ TEST_SUITE(FF_TEST_SUITE) { comm_cost + post_cost.forward_runtime, /*backward_runtime=*/pre_cost.backward_runtime + comm_cost + post_cost.backward_runtime, - /*memory=*/pre_cost.memory + post_cost.memory, + /*memory_usage=*/pre_cost.memory_usage + + post_cost.memory_usage, }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -321,7 +322,8 @@ TEST_SUITE(FF_TEST_SUITE) { comm_cost + post_cost.forward_runtime, /*backward_runtime=*/pre_cost.backward_runtime + comm_cost + post_cost.backward_runtime, - /*memory=*/pre_cost.memory + post_cost.memory, + /*memory_usage=*/pre_cost.memory_usage + + post_cost.memory_usage, }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{{ @@ -389,9 +391,9 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics lhs_cost = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -416,9 +418,9 @@ TEST_SUITE(FF_TEST_SUITE) { }}; OpCostMetrics rhs_cost = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + 
/*memory_usage=*/1_bytes, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -461,7 +463,8 @@ TEST_SUITE(FF_TEST_SUITE) { /*backward_runtime=*/ std::max(lhs_cost.backward_runtime, rhs_cost.backward_runtime), - /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory), + /*memory_usage=*/ + std::max(lhs_cost.memory_usage, rhs_cost.memory_usage), }, /*machine_mapping=*/ ParallelLayerGuidObliviousMachineMapping{ @@ -536,19 +539,19 @@ TEST_SUITE(FF_TEST_SUITE) { }; OpCostMetrics cost1 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/2_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/2_bytes, }; OpCostMetrics cost2 = OpCostMetrics{ - /*forward_runtime=*/4.0, - /*backward_runtime=*/4.0, - /*memory=*/1_n, + /*forward_runtime=*/4_ms, + /*backward_runtime=*/4_ms, + /*memory_usage=*/1_bytes, }; OpCostMetrics cost3 = OpCostMetrics{ - /*forward_runtime=*/2.0, - /*backward_runtime=*/2.0, - /*memory=*/3_n, + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + /*memory_usage=*/3_bytes, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ diff --git a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc index 13f15f6db3..81531d7073 100644 --- a/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/pcg/get_pcg_series_parallel_decomposition.cc @@ -296,7 +296,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t_p4 = get_only(p4_added.outputs); RepartitionAttrs p5_attrs = RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{0_n}, + /*repartition_dim=*/ff_dim_t{1_n}, /*repartition_degree=*/2_p, }; ParallelLayerAddedResult p5_added = diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index c3c83dd6b8..6571b78540 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -1,10 +1,10 @@ #include "compiler/task_graph_simulator/task_simulator.h" -#include "../cost_estimator_for_test.h" #include "compiler/cost_estimator/cost_estimator.h" #include "compiler/cost_estimator/op_cost_metrics.dtg.h" #include "compiler/machine_mapping/machine_mapping.dtg.h" #include "compiler/machine_mapping/machine_mapping.h" #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "internal/runtime_only_cost_estimator_for_test.h" #include "op-attrs/ops/input_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.dtg.h" #include "op-attrs/parallel_tensor_shape.dtg.h" @@ -83,41 +83,45 @@ TEST_SUITE(FF_TEST_SUITE) { }}; SUBCASE("constant op, comm cost") { - CostEstimator estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); - float correct = 10 + 1 + 10; + milliseconds_t correct = 10_ms + 1_ms + 
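The tests in this region migrate raw float runtimes and nonnegative_int memory counts to the strong types milliseconds_t and num_bytes_t with their _ms/_bytes literals. A minimal sketch of such a literal-friendly unit type, assuming a single float field and only the operators these tests rely on (the actual definitions elsewhere in the repo may differ):

struct milliseconds_t {
  float unwrapped; // assumed representation

  constexpr bool operator==(milliseconds_t const &o) const {
    return this->unwrapped == o.unwrapped;
  }
  // addition is used throughout the expected values, e.g. 10_ms + 1_ms
  constexpr milliseconds_t operator+(milliseconds_t const &o) const {
    return milliseconds_t{this->unwrapped + o.unwrapped};
  }
};

// user-defined literals so tests can write 1.5_ms and 10_ms
constexpr milliseconds_t operator""_ms(long double v) {
  return milliseconds_t{static_cast<float>(v)};
}
constexpr milliseconds_t operator""_ms(unsigned long long v) {
  return milliseconds_t{static_cast<float>(v)};
}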
10_ms; CHECK(result == correct); } SUBCASE("variable op, comm cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layer1 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layer1 + } else { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 5 + 1; + milliseconds_t correct = 10_ms + 5_ms + 1_ms; CHECK(result == correct); } } @@ -173,41 +177,47 @@ TEST_SUITE(FF_TEST_SUITE) { {layer2, mv2}, {layer3, mv3}, }}; + SUBCASE("constant op, comm cost") { - CostEstimator estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); - float correct = 10 + 1 + 10 + 1 + 10; + milliseconds_t correct = 10_ms + 1_ms + 10_ms + 1_ms + 10_ms; CHECK(result == correct); } + SUBCASE("variable op, comm cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layers 1, 2 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/2.0f, - /*backward_runtime=*/2.0f, - /*memory=*/0_n}; // layer3 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layers 1, 2 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + }; // layer3 + } else { + return 
RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); } } @@ -220,44 +230,50 @@ TEST_SUITE(FF_TEST_SUITE) { {layer2, mv}, {layer3, mv}, }}; + SUBCASE("constant op, cost cost") { - CostEstimator cost_estimator = make_fake_constant_cost_estimator( - /*forward_op_cost=*/10.0f, - /*backward_op_cost=*/10.0f, - /*comm_cost=*/1.0f, - /*memory_cost=*/0_n); + RuntimeOnlyCostEstimator cost_estimator = + make_fake_constant_runtime_only_cost_estimator( + /*forward_op_cost=*/10_ms, + /*backward_op_cost=*/10_ms, + /*comm_cost=*/1_ms); - float result = task_simulator_estimate_forward_pass_time( + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 10 + 10 + 10 + 1 + 1; + milliseconds_t correct = 10_ms + 10_ms + 10_ms + 10_ms + 1_ms + 1_ms; CHECK(result == correct); } + SUBCASE("variable op, cost cost") { - CostEstimator cost_estimator = make_fake_cost_estimator( - [](OpCostEstimateKey const &op) { - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/10.0f, - /*backward_runtime=*/10.0f, - /*memory=*/0_n}; // layer0 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/0_n}; // layers 1, 2 - } - if (op.op_attrs.has()) { - return OpCostMetrics{/*forward_runtime=*/2.0f, - /*backward_runtime=*/2.0f, - /*memory=*/0_n}; // layer3 - } - return OpCostMetrics{/*forward_runtime=*/0.0f, - /*backward_runtime=*/0.0f, - /*memory=*/0_n}; - }, - [](TensorSetMovement const &comm) { return 5.0f; }); - float result = task_simulator_estimate_forward_pass_time( + RuntimeOnlyCostEstimator cost_estimator = + make_fake_runtime_only_cost_estimator( + [](RuntimeOnlyOpCostEstimateKey const &key) { + if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/10_ms, + /*backward_runtime=*/10_ms, + }; // layer0 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/1_ms, + /*backward_runtime=*/1_ms, + }; // layers 1, 2 + } else if (key.op_attrs.has()) { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/2_ms, + /*backward_runtime=*/2_ms, + }; // layer3 + } else { + return RuntimeOnlyOpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + }; + } + }, + [](TensorSetMovement const &comm) { return 5_ms; }); + milliseconds_t result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); - float correct = 10 + 5 + (1 + 1) + 5 + 2; + milliseconds_t correct = 10_ms + 5_ms + (1_ms + 1_ms) + 5_ms + 2_ms; CHECK(result == correct); } } diff --git a/lib/compiler/test/src/compiler/cost_estimator_for_test.cc b/lib/compiler/test/src/internal/cost_estimator_for_test.cc similarity index 73% rename from lib/compiler/test/src/compiler/cost_estimator_for_test.cc rename to lib/compiler/test/src/internal/cost_estimator_for_test.cc index 48e6f5e561..60bf6ba7a4 100644 --- a/lib/compiler/test/src/compiler/cost_estimator_for_test.cc +++ b/lib/compiler/test/src/internal/cost_estimator_for_test.cc @@ -9,7 +9,7 @@ namespace FlexFlow { TestCostEstimator::TestCostEstimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost) : get_operator_cost(get_operator_cost), get_communication_cost(get_communication_cost) {} @@ -19,14 +19,15 @@ OpCostMetrics return this->get_operator_cost(k); } -float 
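The fake estimators exercised above pin per-op and per-communication costs from plain lambdas. A minimal usage sketch of the memory-aware variant declared in cost_estimator_for_test.h, assuming only the declarations shown in this patch:

// Sketch: a constant-cost estimator built from two lambdas.
CostEstimator estimator = make_fake_cost_estimator(
    [](OpCostEstimateKey const &) {
      return OpCostMetrics{/*forward_runtime=*/2_ms,
                           /*backward_runtime=*/2_ms,
                           /*memory_usage=*/64_bytes};
    },
    [](TensorSetMovement const &) { return 0.5_ms; });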
TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { +milliseconds_t + TestCostEstimator::estimate_cost(TensorSetMovement const &m) const { return this->get_communication_cost(m); } CostEstimator make_fake_cost_estimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost) { return CostEstimator::create(get_operator_cost, get_communication_cost); @@ -34,7 +35,8 @@ CostEstimator make_fake_cost_estimator( CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map) { + std::unordered_map const + &comm_cost_map) { return make_fake_cost_estimator( [op_cost_map](OpCostEstimateKey const &k) { return op_cost_map.at(k); }, [comm_cost_map](TensorSetMovement const &m) { @@ -42,10 +44,10 @@ CostEstimator make_fake_cost_estimator( }); } -CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, - float backward_op_cost, - float comm_cost, - nonnegative_int memory_cost) { +CostEstimator make_fake_constant_cost_estimator(milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost, + num_bytes_t memory_cost) { return make_fake_cost_estimator( [=](OpCostEstimateKey const &op) { return OpCostMetrics{forward_op_cost, backward_op_cost, memory_cost}; diff --git a/lib/compiler/test/src/compiler/cost_estimator_for_test.h b/lib/compiler/test/src/internal/cost_estimator_for_test.h similarity index 69% rename from lib/compiler/test/src/compiler/cost_estimator_for_test.h rename to lib/compiler/test/src/internal/cost_estimator_for_test.h index 1e8ce83caf..6a0094839c 100644 --- a/lib/compiler/test/src/compiler/cost_estimator_for_test.h +++ b/lib/compiler/test/src/internal/cost_estimator_for_test.h @@ -13,7 +13,8 @@ namespace FlexFlow { struct TestCostEstimator : public ICostEstimator { std::function get_operator_cost; - std::function get_communication_cost; + std::function + get_communication_cost; TestCostEstimator() = delete; TestCostEstimator(decltype(get_operator_cost) const &get_operator_cost, @@ -22,23 +23,23 @@ struct TestCostEstimator : public ICostEstimator { OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; - float estimate_cost(TensorSetMovement const &) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; }; CostEstimator make_fake_cost_estimator( std::function const &get_operator_cost, - std::function const + std::function const &get_communication_cost); CostEstimator make_fake_cost_estimator( std::unordered_map const &op_cost_map, - std::unordered_map const &comm_cost_map); + std::unordered_map const &comm_cost_map); -CostEstimator make_fake_constant_cost_estimator(float forward_op_cost, - float backward_op_cost, - float comm_cost, - nonnegative_int memory_cost); +CostEstimator make_fake_constant_cost_estimator(milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost, + num_bytes_t memory_cost); } // namespace FlexFlow diff --git a/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc new file mode 100644 index 0000000000..c52344c6b3 --- /dev/null +++ b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.cc @@ -0,0 +1,52 @@ +#include "internal/runtime_only_cost_estimator_for_test.h" +#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include 
"compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/cost_estimator/op_cost_metrics.h" +#include "compiler/cost_estimator/runtime_only_cost_estimator_from_cost_estimator.h" +#include "internal/cost_estimator_for_test.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost) { + return runtime_only_cost_estimator_from_cost_estimator( + make_fake_cost_estimator( + [get_operator_cost](OpCostEstimateKey const &key) -> OpCostMetrics { + RuntimeOnlyOpCostMetrics runtime_only_op_cost_metrics = + get_operator_cost(runtime_only_from_op_cost_estimate_key(key)); + return make_op_cost_metrics_from_runtime_only( + runtime_only_op_cost_metrics, 0_bytes); + }, + get_communication_cost)); +} + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const + &comm_cost_map) { + return make_fake_runtime_only_cost_estimator( + [op_cost_map](RuntimeOnlyOpCostEstimateKey const &k) { + return op_cost_map.at(k); + }, + [comm_cost_map](TensorSetMovement const &m) { + return comm_cost_map.at(m); + }); +} + +RuntimeOnlyCostEstimator make_fake_constant_runtime_only_cost_estimator( + milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost) { + return make_fake_runtime_only_cost_estimator( + [=](RuntimeOnlyOpCostEstimateKey const &op) { + return RuntimeOnlyOpCostMetrics{forward_op_cost, backward_op_cost}; + }, + [=](TensorSetMovement const &op) { return comm_cost; }); +} + +} // namespace FlexFlow diff --git a/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h new file mode 100644 index 0000000000..2b5824263d --- /dev/null +++ b/lib/compiler/test/src/internal/runtime_only_cost_estimator_for_test.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_TEST_SRC_INTERNAL_RUNTIME_ONLY_COST_ESTIMATOR_FOR_TEST_H +#define _FLEXFLOW_LIB_COMPILER_TEST_SRC_INTERNAL_RUNTIME_ONLY_COST_ESTIMATOR_FOR_TEST_H + +#include "compiler/cost_estimator/runtime_only_cost_estimator.h" + +namespace FlexFlow { + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::function const &get_operator_cost, + std::function const + &get_communication_cost); + +RuntimeOnlyCostEstimator make_fake_runtime_only_cost_estimator( + std::unordered_map const &op_cost_map, + std::unordered_map const &comm_cost_map); + +RuntimeOnlyCostEstimator make_fake_constant_runtime_only_cost_estimator( + milliseconds_t forward_op_cost, + milliseconds_t backward_op_cost, + milliseconds_t comm_cost); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/accessor.h b/lib/kernels/include/kernels/accessor.h index eb2a431bd1..ec0d6fde0d 100644 --- a/lib/kernels/include/kernels/accessor.h +++ b/lib/kernels/include/kernels/accessor.h @@ -1,25 +1,29 @@ -#ifndef _FLEXFLOW_KERNELS_ACCESSOR_H -#define _FLEXFLOW_KERNELS_ACCESSOR_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ACCESSOR_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ACCESSOR_H -#include "kernels/array_shape.h" #include "kernels/device.h" #include "kernels/ff_handle.h" +#include "kernels/legion_dim.h" +#include "kernels/legion_ordered/legion_ordered.h" #include "op-attrs/datatype.h" +#include "op-attrs/tensor_dims.dtg.h" +#include "op-attrs/tensor_dims.h" +#include "op-attrs/tensor_shape.dtg.h" #include "pcg/device_type.dtg.h" #include 
"utils/containers/transform.h" #include +#include namespace FlexFlow { -nonnegative_int - calculate_accessor_offset(LegionOrdered const &, - ArrayShape const &); +nonnegative_int calculate_accessor_offset(TensorDimsCoord const &, + TensorDims const &); class GenericTensorAccessorR { public: template typename data_type_enum_to_class
::type const *get() const { - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); return static_cast const *>(this->ptr); } @@ -32,8 +36,7 @@ class GenericTensorAccessorR { GenericTensorAccessorR() = delete; - GenericTensorAccessorR(DataType data_type, - ArrayShape const &shape, + GenericTensorAccessorR(TensorShape const &shape, void const *ptr, DeviceType device_type); @@ -41,32 +44,25 @@ class GenericTensorAccessorR { bool operator!=(GenericTensorAccessorR const &) const; template - real_type_t
const &at(FFOrdered const &indices) const { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
const & - at(LegionOrdered const &indices) const { + real_type_t
const &at(TensorDimsCoord const &indices) const { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorR::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; T const *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); return data_ptr[offset.unwrap_nonnegative()]; } public: - DataType data_type; - ArrayShape shape; + TensorShape shape; void const *ptr; DeviceType device_type; private: - std::tuple tie() const; @@ -79,7 +75,7 @@ class GenericTensorAccessorW { public: template typename data_type_enum_to_class
::type *get() const { - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); return static_cast *>(this->ptr); } @@ -92,8 +88,7 @@ class GenericTensorAccessorW { GenericTensorAccessorW() = delete; - GenericTensorAccessorW(DataType data_type, - ArrayShape const &shape, + GenericTensorAccessorW(TensorShape const &shape, void *ptr, DeviceType device_type); @@ -103,48 +98,38 @@ class GenericTensorAccessorW { operator GenericTensorAccessorR() const; template - real_type_t
&at(FFOrdered const &indices) { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
&at(LegionOrdered const &indices) { + real_type_t
&at(TensorDimsCoord const &indices) { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorW::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; T *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); return data_ptr[offset.unwrap_nonnegative()]; } template - real_type_t
const &at(FFOrdered const &indices) const { - return this->at
(legion_ordered_from_ff_ordered(indices)); - } - - template - real_type_t
&at(LegionOrdered const &indices) const { + real_type_t
&at(TensorDimsCoord const &indices) const { ASSERT(this->device_type == DeviceType::CPU, "GenericTensorAccessorW::at() requires CPU-allocated tensor"); - ASSERT(this->data_type == DT, "Invalid datatype requested"); + ASSERT(this->shape.data_type == DT, "Invalid datatype requested"); using T = real_type_t
; - T const *data_ptr = static_cast(this->ptr); - nonnegative_int offset = calculate_accessor_offset(indices, this->shape); - return data_ptr[offset]; + T *data_ptr = static_cast(this->ptr); + nonnegative_int offset = + calculate_accessor_offset(indices, this->shape.dims); + return data_ptr[offset.unwrap_nonnegative()]; } public: - DataType data_type; - ArrayShape shape; + TensorShape shape; void *ptr; DeviceType device_type; private: - std::tuple tie() const; @@ -156,7 +141,7 @@ std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &); template typename data_type_enum_to_class
::type * get(GenericTensorAccessorW const &a) { - ASSERT(a.data_type == DT, "Invalid datatype requested"); + ASSERT(a.shape.data_type == DT, "Invalid datatype requested"); return static_cast *>(a.ptr); } @@ -173,7 +158,7 @@ std::vector *> template typename data_type_enum_to_class
::type const * get(GenericTensorAccessorR const &a) { - ASSERT(a.data_type == DT, "Invalid datatype requested"); + ASSERT(a.shape.data_type == DT, "Invalid datatype requested"); return static_cast const *>(a.ptr); } @@ -221,30 +206,16 @@ std::vector const *> GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &write_accessor); -bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, - GenericTensorAccessorR const &acc2); -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2); - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype); - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor); -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor); +TensorShape get_tensor_shape_for_accessor_r(GenericTensorAccessorR const &); +TensorShape get_tensor_shape_for_accessor_w(GenericTensorAccessorW const &); -void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor, +void copy_accessor_data_to_l_from_r(GenericTensorAccessorW const &dst_accessor, GenericTensorAccessorR const &src_accessor); template real_type_t
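With the TensorDimsCoord-based at() above replacing the old FFOrdered/LegionOrdered overload pair, element access against this header reads roughly as follows. This is a minimal sketch, assuming a CPU Allocator named cpu_allocator and a FLOAT-typed TensorShape named shape already exist (the TensorDimsCoord{FFOrdered{...}} construction is the same one used in create_accessor_with_contents.h later in this patch):

// Sketch: write, then read back, one element of a CPU-allocated float tensor.
GenericTensorAccessorW acc = cpu_allocator.allocate_tensor(shape);

TensorDimsCoord coord = TensorDimsCoord{FFOrdered{0_n, 2_n}};
acc.at<DataType::FLOAT>(coord) = 3.5f; // at() asserts device_type == CPU

GenericTensorAccessorR r = read_only_accessor_from_write_accessor(acc);
float v = r.at<DataType::FLOAT>(coord); // v == 3.5f

 template <DataType DT>
 real_type_t<DT>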
    accessor_get_only_value(GenericTensorAccessorR const &acc) {
-  ASSERT(get_num_elements(acc.shape) == 1);
-  ASSERT(acc.data_type == DT);
+  ASSERT(get_num_elements(acc.shape.dims) == 1);
+  ASSERT(acc.shape.data_type == DT);
   return *static_cast<real_type_t<DT> const *>(acc.ptr);
 }

diff --git a/lib/kernels/include/kernels/allocation.h b/lib/kernels/include/kernels/allocation.h
index 39bad6599c..0863e2d0ac 100644
--- a/lib/kernels/include/kernels/allocation.h
+++ b/lib/kernels/include/kernels/allocation.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
-#define _FLEXFLOW_KERNELS_ALLOCATION_H
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATION_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATION_H

 #include "kernels/accessor.h"
 #include
diff --git a/lib/kernels/include/kernels/array_coord.h b/lib/kernels/include/kernels/array_coord.h
deleted file mode 100644
index 730bb49e81..0000000000
--- a/lib/kernels/include/kernels/array_coord.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
-#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ARRAY_COORD_H
-
-#include "kernels/array_coord.dtg.h"
-
-namespace FlexFlow {
-
-ArrayCoord
-    array_coord_drop_dims(ArrayCoord const &coord,
-                          std::function<bool(legion_dim_t)> const &should_drop_dim);
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h
deleted file mode 100644
index 2b1397dc0e..0000000000
--- a/lib/kernels/include/kernels/array_shape.h
+++ /dev/null
@@ -1,79 +0,0 @@
-#ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H
-#define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H
-
-#include "kernels/array_coord.dtg.h"
-#include "kernels/legion_dim.h"
-#include "op-attrs/tensor_shape.dtg.h"
-#include "utils/positive_int/positive_int.h"
-#include "utils/stack_vector/stack_vector.h"
-#include "utils/visitable.h"
-#include
-#include
-#include
-
-namespace FlexFlow {
-
-struct ArrayShape {
-public:
-  ArrayShape() = delete;
-  explicit ArrayShape(LegionOrdered<positive_int> const &dims);
-
-  positive_int num_elements() const;
-
-  nonnegative_int num_dims() const;
-
-  positive_int operator[](legion_dim_t) const;
-  positive_int at(legion_dim_t) const;
-  positive_int at(ff_dim_t) const;
-
-  bool operator==(ArrayShape const &) const;
-  bool operator!=(ArrayShape const &) const;
-
-  legion_dim_t last_idx() const;
-  legion_dim_t neg_idx(int) const;
-
-  std::optional<positive_int> at_maybe(legion_dim_t) const;
-  std::optional<positive_int> at_maybe(ff_dim_t) const;
-
-  ArrayShape sub_shape(ff_dim_t const &start,
-                       std::optional<ff_dim_t> const &end) const;
-
-  ArrayShape sub_shape(legion_dim_t const &start,
-                       std::optional<legion_dim_t> const &end) const;
-
-public:
-  LegionOrdered<positive_int> dims;
-
-private:
-  std::tuple<LegionOrdered<positive_int>> tie() const;
-
-  friend ::std::hash<ArrayShape>;
-};
-
-std::string format_as(ArrayShape const &);
-std::ostream &operator<<(std::ostream &, ArrayShape const &);
-
-positive_int get_num_elements(ArrayShape const &);
-
-ArrayShape array_shape_from_tensor_shape(TensorShape const &);
-TensorShape get_tensor_shape(ArrayShape const &, DataType);
-
-std::unordered_set<ff_dim_t> get_ff_dim_t_set(ArrayShape const &);
-std::unordered_set<ArrayCoord> get_array_coord_set(ArrayShape const &);
-
-ArrayShape
-    array_shape_drop_dims(ArrayShape const &shape,
-                          std::function<bool(legion_dim_t)> const &should_drop_dim);
-
-} // namespace FlexFlow
-
-namespace std {
-
-template <>
-struct hash<::FlexFlow::ArrayShape> {
-  size_t operator()(::FlexFlow::ArrayShape const &) const;
-};
-
-} // namespace std
-
-#endif
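calculate_accessor_offset, which the new at() overloads call above, presumably flattens a TensorDimsCoord into a linear element index over TensorDims. A minimal row-major sketch, assuming that layout (the function name here is hypothetical and the real implementation may choose a different stride order):

#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical row-major flattening over plain ints; the real
// calculate_accessor_offset operates on TensorDimsCoord/TensorDims.
int row_major_offset(std::vector<int> const &coord,
                     std::vector<int> const &dims) {
  assert(coord.size() == dims.size());
  int offset = 0;
  for (size_t i = 0; i < dims.size(); i++) {
    assert(0 <= coord[i] && coord[i] < dims[i]);
    offset = offset * dims[i] + coord[i]; // outer dims get larger strides
  }
  return offset;
}

diff --git a/lib/kernels/include/kernels/attention_kernels.h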
b/lib/kernels/include/kernels/attention_kernels.h index b3c77d3430..0c5caabfa0 100644 --- a/lib/kernels/include/kernels/attention_kernels.h +++ b/lib/kernels/include/kernels/attention_kernels.h @@ -3,93 +3,39 @@ #include "kernels/allocation.h" #include "kernels/device.h" -#include "kernels/ff_handle.h" -#include - -namespace FlexFlow { - -struct MHAPerDeviceState { - PerDeviceFFHandle handle; - size_t weightSize; - size_t reserveSpaceSize; - ffAttnDescriptor_t attnDesc; - ffSeqDataDescriptor_t qDesc; - ffSeqDataDescriptor_t kDesc; - ffSeqDataDescriptor_t vDesc; - ffSeqDataDescriptor_t oDesc; - int *devQoSeqArray; - int *devKvSeqArray; - int *loWinIdx; - int *hiWinIdx; - void *reserveSpace; - Allocator allocator; - - bool operator==(MHAPerDeviceState const &other) const; - bool operator!=(MHAPerDeviceState const &other) const; - -private: - std::tuple - tie() const; -}; - -FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState, - handle, - weightSize, - reserveSpaceSize, - attnDesc, - qDesc, - kDesc, - vDesc, - oDesc, - devQoSeqArray, - devKvSeqArray, - loWinIdx, - hiWinIdx, - reserveSpace, - allocator); - -std::string format_as(MHAPerDeviceState const &x); -std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x); - -namespace Kernels::MultiHeadAttention { - -MHAPerDeviceState init_kernel(PerDeviceFFHandle const &, - Allocator &, - int num_samples, - int num_heads, - int qSize, - int kSize, - int vSize, - int qProjSize, - int kProjSize, - int vProjSize, - int oProjSize, - int qoSeqLength, - int kvSeqLength, - bool add_bias_kv); - -void forward_kernel(ffStream_t stream, - MHAPerDeviceState const &device_state, +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/mha_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &per_device_ff_handle, + Allocator &allocator, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv); + +void forward_kernel(device_stream_t const &stream, + std::optional const &device_state, float const *query_ptr, float const *key_ptr, float const *value_ptr, float const *weight_ptr, float *output_ptr); -void backward_kernel(ffStream_t stream, - MHAPerDeviceState const &device_state, +void backward_kernel(device_stream_t const &stream, + std::optional const &device_state, float const *query_ptr, float *query_grad_ptr, float const *key_ptr, @@ -100,10 +46,10 @@ void backward_kernel(ffStream_t stream, float *weight_grad_ptr, float const *output_grad_ptr); -void cleanup_kernel(Allocator &allocator, - MHAPerDeviceState const &device_state); +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional const &device_state); -} // namespace Kernels::MultiHeadAttention -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::MultiHeadAttention #endif diff --git a/lib/kernels/include/kernels/attention_kernels_cpu.h b/lib/kernels/include/kernels/attention_kernels_cpu.h new file mode 100644 index 0000000000..3dfdb45d42 --- /dev/null +++ b/lib/kernels/include/kernels/attention_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_CPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" +#include 
"kernels/ff_handle.h" +#include "kernels/mha_per_device_state.dtg.h" +#include + +namespace FlexFlow::Kernels::MultiHeadAttention { + +void cpu_forward_kernel(float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr); + +void cpu_backward_kernel(float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr); + +} // namespace FlexFlow::Kernels::MultiHeadAttention + +#endif diff --git a/lib/kernels/include/kernels/attention_kernels_gpu.h b/lib/kernels/include/kernels/attention_kernels_gpu.h new file mode 100644 index 0000000000..655477a6b0 --- /dev/null +++ b/lib/kernels/include/kernels/attention_kernels_gpu.h @@ -0,0 +1,52 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ATTENTION_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/mha_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +MHAPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &, + Allocator &, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv); + +void gpu_forward_kernel(ffStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr); + +void gpu_cleanup_kernel(Allocator &allocator, + MHAPerDeviceState const &device_state); + +} // namespace FlexFlow::Kernels::MultiHeadAttention + +#endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels.h b/lib/kernels/include/kernels/batch_matmul_kernels.h index 8b67f564d2..db377162b6 100644 --- a/lib/kernels/include/kernels/batch_matmul_kernels.h +++ b/lib/kernels/include/kernels/batch_matmul_kernels.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_BATCH_MATMUL_KERNELS_H -#include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" namespace FlexFlow::Kernels::BatchMatmul { -void forward_kernel(ffStream_t stream, - PerDeviceFFHandle const &handle, +void forward_kernel(device_stream_t const &stream, + device_handle_t const &handle, float *output_ptr, float const *a_input_ptr, float const *b_input_ptr, @@ -20,8 +20,8 @@ void forward_kernel(ffStream_t stream, int a_seq_length_dim, int b_seq_length_dim); -void backward_kernel(ffStream_t stream, - PerDeviceFFHandle const &handle, +void backward_kernel(device_stream_t const &stream, + device_handle_t const &handle, float const *o_ptr, float const *o_grad_ptr, float const *a_ptr, diff --git a/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h b/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h new file mode 100644 index 
0000000000..fdef3d7fa1 --- /dev/null +++ b/lib/kernels/include/kernels/batch_matmul_kernels_cpu.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_CPU_H + +#include "kernels/allocation.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void cpu_forward_kernel(float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim); + +void cpu_backward_kernel(float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch); + +} // namespace FlexFlow::Kernels::BatchMatmul + +#endif diff --git a/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h b/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h new file mode 100644 index 0000000000..4a35c000c3 --- /dev/null +++ b/lib/kernels/include/kernels/batch_matmul_kernels_gpu.h @@ -0,0 +1,38 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_MATMUL_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void gpu_forward_kernel(ffStream_t stream, + PerDeviceFFHandle const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim); + +void gpu_backward_kernel(ffStream_t stream, + PerDeviceFFHandle const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch); + +} // namespace FlexFlow::Kernels::BatchMatmul + +#endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 9bb2753a12..47cb3d85a8 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -3,29 +3,31 @@ #include "kernels/allocation.h" #include "kernels/batch_norm_per_device_state.dtg.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" -#include namespace FlexFlow::Kernels::BatchNorm { -BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, - Allocator allocator, - float *runningMean, - int output_n, - int output_c, - int output_h, - int output_w, - bool relu); +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu); -void forward_kernel(ffStream_t stream, - BatchNormPerDeviceState const &per_device_statem, +void forward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, float const *input_ptr, float *output_ptr, float const *scale_ptr, float const *bias_ptr); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, BatchNormPerDeviceState const &per_device_state, float const *output_ptr, float *output_grad_ptr, @@ -36,13 +38,10 @@ void backward_kernel(ffStream_t stream, float *bias_grad_ptr, size_t numElements); -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t 
inputTensor, - ffTensorDescriptor_t biasTensor, - ffTensorDescriptor_t outputTensor, - ffActivationDescriptor_t actiDesc, - bool relu, - float *runningMean); +void cleanup_kernel( + DeviceType device_type, + Allocator &allocator, + std::optional const &per_device_state); } // namespace FlexFlow::Kernels::BatchNorm #endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels_cpu.h b/lib/kernels/include/kernels/batch_norm_kernels_cpu.h new file mode 100644 index 0000000000..8c564d6450 --- /dev/null +++ b/lib/kernels/include/kernels/batch_norm_kernels_cpu.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_CPU_H + +#include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device_stream_t.dtg.h" + +namespace FlexFlow::Kernels::BatchNorm { + +void cpu_forward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr); + +void cpu_backward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements); + +} // namespace FlexFlow::Kernels::BatchNorm + +#endif diff --git a/lib/kernels/include/kernels/batch_norm_kernels_gpu.h b/lib/kernels/include/kernels/batch_norm_kernels_gpu.h new file mode 100644 index 0000000000..41f9808bff --- /dev/null +++ b/lib/kernels/include/kernels/batch_norm_kernels_gpu.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_BATCH_NORM_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow::Kernels::BatchNorm { + +BatchNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu); + +void gpu_forward_kernel(ffStream_t stream, + BatchNormPerDeviceState const &per_device_statem, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr); + +void gpu_backward_kernel(ffStream_t stream, + BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements); + +void gpu_cleanup_kernel(Allocator &allocator, + BatchNormPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::BatchNorm + +#endif diff --git a/lib/kernels/include/kernels/cast_kernels.h b/lib/kernels/include/kernels/cast_kernels.h index 5ec4cb3975..adc64970a1 100644 --- a/lib/kernels/include/kernels/cast_kernels.h +++ b/lib/kernels/include/kernels/cast_kernels.h @@ -2,17 +2,17 @@ #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Cast { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - 
GenericTensorAccessorW const &input); +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_cpu.h b/lib/kernels/include/kernels/cast_kernels_cpu.h index 343ba253d9..2b3d03f097 100644 --- a/lib/kernels/include/kernels/cast_kernels_cpu.h +++ b/lib/kernels/include/kernels/cast_kernels_cpu.h @@ -2,15 +2,14 @@ #define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H #include "kernels/accessor.h" -#include "kernels/device.h" namespace FlexFlow::Kernels::Cast { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input); +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/include/kernels/cast_kernels_gpu.h b/lib/kernels/include/kernels/cast_kernels_gpu.h new file mode 100644 index 0000000000..47336804e9 --- /dev/null +++ b/lib/kernels/include/kernels/cast_kernels_gpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CAST_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CAST_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Cast { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Cast + +#endif diff --git a/lib/kernels/include/kernels/combine_kernels.h b/lib/kernels/include/kernels/combine_kernels.h deleted file mode 100644 index c87465a01f..0000000000 --- a/lib/kernels/include/kernels/combine_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Combine { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace FlexFlow::Kernels::Combine - -#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_H diff --git a/lib/kernels/include/kernels/combine_kernels_cpu.h b/lib/kernels/include/kernels/combine_kernels_cpu.h deleted file mode 100644 index 75fdd56498..0000000000 --- a/lib/kernels/include/kernels/combine_kernels_cpu.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H -#define _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Combine { - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace FlexFlow::Kernels::Combine - -#endif // _FLEXFLOW_OPS_KERNELS_COMBINE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/concat_kernels.h b/lib/kernels/include/kernels/concat_kernels.h index 1e3c55bf59..793bf52505 100644 --- 
a/lib/kernels/include/kernels/concat_kernels.h +++ b/lib/kernels/include/kernels/concat_kernels.h @@ -2,16 +2,16 @@ #define _FLEXFLOW_OPS_KERNELS_CONCAT_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Concat { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorW const &output, std::vector const &inputs, ff_dim_t axis); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output_grad, std::vector const &input_grads, ff_dim_t axis); diff --git a/lib/kernels/include/kernels/concat_kernels_cpu.h b/lib/kernels/include/kernels/concat_kernels_cpu.h new file mode 100644 index 0000000000..4a7f9fd3c8 --- /dev/null +++ b/lib/kernels/include/kernels/concat_kernels_cpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Concat { + +void cpu_forward_kernel(GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis); + +} // namespace FlexFlow::Kernels::Concat + +#endif diff --git a/lib/kernels/include/kernels/concat_kernels_gpu.h b/lib/kernels/include/kernels/concat_kernels_gpu.h new file mode 100644 index 0000000000..3aaf3fbe2c --- /dev/null +++ b/lib/kernels/include/kernels/concat_kernels_gpu.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONCAT_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Concat { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis); + +} // namespace FlexFlow::Kernels::Concat + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels.h b/lib/kernels/include/kernels/conv_2d_kernels.h index 3b7c0672df..eb7cd7327a 100644 --- a/lib/kernels/include/kernels/conv_2d_kernels.h +++ b/lib/kernels/include/kernels/conv_2d_kernels.h @@ -2,74 +2,53 @@ #define _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/conv_2d_per_device_state.dtg.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" #include "op-attrs/activation.dtg.h" -#include "utils/visitable.h" -namespace FlexFlow { - -struct Conv2DPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t biasTensor; - ffTensorDescriptor_t outputTensor; - ffFilterDescriptor_t filterDesc; - ffActivationDescriptor_t actiDesc; - ffConvolutionDescriptor_t convDesc; - ffConvolutionFwdAlgo_t fwdAlgo; - ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; - req bwdDataAlgo; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Conv2DPerDeviceState, - handle, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo); - -namespace Kernels::Conv2D { - -Conv2DPerDeviceState 
init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int kernel_h, - int kernel_w, - int groups, - int padding_h, - int padding_w, - int stride_h, - int stride_w, - GenericTensorAccessorW const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr); - -void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, +namespace FlexFlow::Kernels::Conv2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, float const *input_ptr, float *output_ptr, float const *filter_ptr, float const *bias_ptr, std::optional activation); -void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *filter_ptr, - float *filter_grad_ptr, - float *bias_grad_ptr, - std::optional activation); - -} // namespace Kernels::Conv2D -} // namespace FlexFlow +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation); + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Conv2D #endif // _FLEXFLOW_OPS_KERNELS_CONV_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/conv_2d_kernels_cpu.h b/lib/kernels/include/kernels/conv_2d_kernels_cpu.h new file mode 100644 index 0000000000..3a783a395f --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_kernels_cpu.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_CPU_H + +#include "op-attrs/activation.dtg.h" +#include + +namespace FlexFlow::Kernels::Conv2D { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional const &activation); + +void cpu_backward_kernel(float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional const &activation); + +} // namespace FlexFlow::Kernels::Conv2D + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_kernels_gpu.h b/lib/kernels/include/kernels/conv_2d_kernels_gpu.h new file mode 100644 index 0000000000..9084838e9d --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_kernels_gpu.h @@ -0,0 +1,44 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_CONV_2D_KERNELS_GPU_H + +namespace FlexFlow::Kernels::Conv2D { + +Conv2DPerDeviceState + gpu_init_kernel(PerDeviceFFHandle const &handle, + std::optional const &activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr); + 
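Read together with the unified declarations in conv_2d_kernels.h above, the cpu_/gpu_ split suggests that forward_kernel dispatches on the stream's device type. A minimal sketch, assuming dtgen-style accessors is_gpu()/require_gpu() on device_stream_t (those names are assumptions, not part of this patch):

// Sketch of the dispatch glue implied by this refactor; not the patch's
// literal conv_2d implementation.
void forward_kernel(device_stream_t const &stream,
                    std::optional<Conv2DPerDeviceState> const &per_device_state,
                    float const *input_ptr,
                    float *output_ptr,
                    float const *filter_ptr,
                    float const *bias_ptr,
                    std::optional<Activation> activation) {
  if (stream.is_gpu()) {                        // assumed accessor name
    gpu_forward_kernel(stream.require_gpu(),    // assumed: unwraps ffStream_t
                       per_device_state.value(), // state expected on GPU
                       input_ptr,
                       output_ptr,
                       filter_ptr,
                       bias_ptr,
                       activation);
  } else {
    cpu_forward_kernel(input_ptr, output_ptr, filter_ptr, bias_ptr, activation);
  }
}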
+void gpu_forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation); + +void gpu_backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation); + +void gpu_cleanup_kernel(Conv2DPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Conv2D + +#endif diff --git a/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml b/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml new file mode 100644 index 0000000000..d76dbc89d0 --- /dev/null +++ b/lib/kernels/include/kernels/conv_2d_per_device_state.struct.toml @@ -0,0 +1,48 @@ +namespace = "FlexFlow" +name = "Conv2DPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "biasTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "filterDesc" +type = "ffFilterDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" + +[[fields]] +name = "convDesc" +type = "ffConvolutionDescriptor_t" + +[[fields]] +name = "fwdAlgo" +type = "ffConvolutionFwdAlgo_t" + +[[fields]] +name = "bwdFilterAlgo" +type = "ffConvolutionBwdFilterAlgo_t" + +[[fields]] +name = "bwdDataAlgo" +type = "ffConvolutionBwdDataAlgo_t" diff --git a/lib/kernels/include/kernels/copy_tensor_accessor.h b/lib/kernels/include/kernels/copy_tensor_accessor.h index 81fd59dafb..ef2254071e 100644 --- a/lib/kernels/include/kernels/copy_tensor_accessor.h +++ b/lib/kernels/include/kernels/copy_tensor_accessor.h @@ -6,7 +6,7 @@ namespace FlexFlow { -GenericTensorAccessorR +GenericTensorAccessorW copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator); diff --git a/lib/kernels/include/kernels/create_accessor_with_contents.h b/lib/kernels/include/kernels/create_accessor_with_contents.h index 9691b0c90a..3574ad0c88 100644 --- a/lib/kernels/include/kernels/create_accessor_with_contents.h +++ b/lib/kernels/include/kernels/create_accessor_with_contents.h @@ -25,8 +25,8 @@ GenericTensorAccessorW for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{col_idx}) = - contents.at(col_idx.unwrap_nonnegative()); + cpu_accessor.at>(TensorDimsCoord{ + FFOrdered{col_idx}}) = contents.at(col_idx.unwrap_nonnegative()); } GenericTensorAccessorW result = allocator.allocate_tensor(shape); @@ -58,9 +58,10 @@ GenericTensorAccessorW create_2d_accessor_w_with_contents( nonnegative_range(nrows.nonnegative_int_from_positive_int())) { for (nonnegative_int col_idx : nonnegative_range(ncols.nonnegative_int_from_positive_int())) { - cpu_accessor.at>(FFOrdered{ - row_idx, col_idx}) = contents.at(row_idx.unwrap_nonnegative()) - .at(col_idx.unwrap_nonnegative()); + cpu_accessor.at>( + TensorDimsCoord{FFOrdered{row_idx, col_idx}}) = + contents.at(row_idx.unwrap_nonnegative()) + .at(col_idx.unwrap_nonnegative()); } } @@ -105,7 +106,7 @@ GenericTensorAccessorW create_3d_accessor_w_with_contents( for (nonnegative_int dim2_idx : 
nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { cpu_accessor.at>( - FFOrdered{dim0_idx, dim1_idx, dim2_idx}) = + TensorDimsCoord{FFOrdered{dim0_idx, dim1_idx, dim2_idx}}) = contents.at(dim0_idx.unwrap_nonnegative()) .at(dim1_idx.unwrap_nonnegative()) .at(dim2_idx.unwrap_nonnegative()); @@ -165,8 +166,8 @@ GenericTensorAccessorW create_4d_accessor_w_with_contents( nonnegative_range(dim2_size.nonnegative_int_from_positive_int())) { for (nonnegative_int dim3_idx : nonnegative_range(dim3_size.nonnegative_int_from_positive_int())) { - accessor.at>( - FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}) = + accessor.at>(TensorDimsCoord{ + FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}}) = contents.at(dim0_idx.unwrap_nonnegative()) .at(dim1_idx.unwrap_nonnegative()) .at(dim2_idx.unwrap_nonnegative()) diff --git a/lib/kernels/include/kernels/create_local_allocator_for_device_type.h b/lib/kernels/include/kernels/create_local_allocator_for_device_type.h new file mode 100644 index 0000000000..16c35f86fd --- /dev/null +++ b/lib/kernels/include/kernels/create_local_allocator_for_device_type.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATOR_FOR_DEVICE_TYPE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ALLOCATOR_FOR_DEVICE_TYPE_H + +#include "kernels/allocation.h" + +namespace FlexFlow { + +Allocator create_local_allocator_for_device_type(DeviceType); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_handle_t.h b/lib/kernels/include/kernels/device_handle_t.h new file mode 100644 index 0000000000..9b7769355e --- /dev/null +++ b/lib/kernels/include/kernels/device_handle_t.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DEVICE_HANDLE_T_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_DEVICE_HANDLE_T_H + +#include "kernels/device_handle_t.dtg.h" +#include "kernels/managed_per_device_ff_handle.h" + +namespace FlexFlow { + +device_handle_t device_handle_t_from_managed_handle( + std::optional const &managed_handle); + +device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle); +device_handle_t cpu_make_device_handle_t(); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_handle_t.variant.toml b/lib/kernels/include/kernels/device_handle_t.variant.toml new file mode 100644 index 0000000000..ef574e0745 --- /dev/null +++ b/lib/kernels/include/kernels/device_handle_t.variant.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "device_handle_t" +features = [] + +includes = [ + "", + "kernels/ff_handle.h", +] + +[[values]] +type = "::FlexFlow::PerDeviceFFHandle" +key = "for_gpu" + +[[values]] +type = "std::monostate" +key = "for_cpu" diff --git a/lib/kernels/include/kernels/device_stream_t.h b/lib/kernels/include/kernels/device_stream_t.h new file mode 100644 index 0000000000..2a9b2313f6 --- /dev/null +++ b/lib/kernels/include/kernels/device_stream_t.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DEVICE_STREAM_T_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DEVICE_STREAM_T_H + +#include "kernels/device_stream_t.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow { + +device_stream_t get_gpu_device_stream(); +device_stream_t get_cpu_device_stream(); +device_stream_t get_stream_for_device_type(DeviceType); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/device_stream_t.variant.toml b/lib/kernels/include/kernels/device_stream_t.variant.toml new file mode 100644 index 0000000000..b3f8e77171 --- 
/dev/null +++ b/lib/kernels/include/kernels/device_stream_t.variant.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "device_stream_t" +features = [] + +includes = [ + "", + "kernels/device.h", +] + +[[values]] +type = "ffStream_t" +key = "gpu" + +[[values]] +type = "std::monostate" +key = "cpu" diff --git a/lib/kernels/include/kernels/dropout_kernels.h b/lib/kernels/include/kernels/dropout_kernels.h index 2cc6dd60a3..39f7238114 100644 --- a/lib/kernels/include/kernels/dropout_kernels.h +++ b/lib/kernels/include/kernels/dropout_kernels.h @@ -2,60 +2,37 @@ #define _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H #include "kernels/allocation.h" -#include "kernels/array_shape.h" -#include "kernels/device.h" -#include "kernels/ff_handle.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/dropout_per_device_state.dtg.h" #include -namespace FlexFlow { - -struct DropoutPerDeviceState { -public: - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffDropoutDescriptor_t dropoutDesc; - void *reserveSpace; - void *dropoutStates; - size_t reserveSpaceSize; - req dropoutStateSize; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(DropoutPerDeviceState, - handle, - inputTensor, - outputTensor, - dropoutDesc, - reserveSpace, - dropoutStates, - reserveSpaceSize, - dropoutStateSize); - -namespace Kernels::Dropout { - -DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, - float rate, - unsigned long long seed, - ArrayShape const &output_domain, - Allocator allocator); - -void forward_kernel(ffStream_t stream, - DropoutPerDeviceState const &m, - float const *input_ptr, - float *output_ptr); - -void backward_kernel(ffStream_t stream, - DropoutPerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr); - -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t outputTensor, - ffDropoutDescriptor_t dropoutDesc, - void *dropoutStates); - -} // namespace Kernels::Dropout -} // namespace FlexFlow +namespace FlexFlow::Kernels::Dropout { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Dropout #endif // _FLEXFLOW_OPS_KERNELS_DROPOUT_KERNELS_H diff --git a/lib/kernels/include/kernels/dropout_kernels_cpu.h b/lib/kernels/include/kernels/dropout_kernels_cpu.h new file mode 100644 index 0000000000..8d107a8b82 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_kernels_cpu.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Dropout { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Dropout + +#endif diff --git a/lib/kernels/include/kernels/dropout_kernels_gpu.h 
b/lib/kernels/include/kernels/dropout_kernels_gpu.h new file mode 100644 index 0000000000..1e75253499 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_kernels_gpu.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_DROPOUT_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/dropout_per_device_state.dtg.h" +#include "kernels/ff_handle.h" +#include + +namespace FlexFlow::Kernels::Dropout { + +DropoutPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator); + +void gpu_forward_kernel(ffStream_t stream, + DropoutPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + DropoutPerDeviceState const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +void gpu_cleanup_kernel(Allocator &allocator, + DropoutPerDeviceState const &per_device_state); + +} // namespace FlexFlow::Kernels::Dropout + +#endif diff --git a/lib/kernels/include/kernels/dropout_per_device_state.struct.toml b/lib/kernels/include/kernels/dropout_per_device_state.struct.toml new file mode 100644 index 0000000000..ffd8bf37e9 --- /dev/null +++ b/lib/kernels/include/kernels/dropout_per_device_state.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "DropoutPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", +] + +[[fields]] +name = "handle" +type = "PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "dropoutDesc" +type = "ffDropoutDescriptor_t" + +[[fields]] +name = "reserveSpace" +type = "void *" + +[[fields]] +name = "dropoutStates" +type = "void *" + +[[fields]] +name = "reserveSpaceSize" +type = "size_t" + +[[fields]] +name = "dropoutStateSize" +type = "size_t" diff --git a/lib/kernels/include/kernels/element_binary_kernels.h b/lib/kernels/include/kernels/element_binary_kernels.h index fd596f2ccf..8c9a405e6f 100644 --- a/lib/kernels/include/kernels/element_binary_kernels.h +++ b/lib/kernels/include/kernels/element_binary_kernels.h @@ -1,63 +1,55 @@ #ifndef _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_ELEMENT_BINARY_KERNELS_H -#include "ff_handle.h" -#include "kernels/array_shape.h" #include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/element_binary_per_device_state.dtg.h" +#include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" - -namespace FlexFlow { - -struct ElementBinaryPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputLHSTensor; - ffTensorDescriptor_t inputRHSTensor; - ffTensorDescriptor_t outputTensor; - ffOpTensorDescriptor_t opDesc; - ffReduceTensorDescriptor_t reduceAddDesc; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ElementBinaryPerDeviceState, - handle, - inputLHSTensor, - inputRHSTensor, - outputTensor, - opDesc, - reduceAddDesc); - -namespace Kernels::ElementBinary { - -ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, - OperatorType op_type, - bool should_broadcast_lhs, - bool should_broadcast_rhs, - ArrayShape lhs_shape, - ArrayShape rhs_shape, - ArrayShape output_shape); - -void 
forward_kernel(ffStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *lhs_ptr, - float const *rhs_ptr, - float *out_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - PerDeviceFFHandle handle); - -void backward_kernel(ffStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle); - -} // namespace Kernels::ElementBinary -} // namespace FlexFlow +#include "op-attrs/tensor_shape.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + device_handle_t const &handle); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + device_handle_t const &handle); + +void cleanup_kernel( + DeviceType device_type, + std::optional const &per_device_state); + +} // namespace FlexFlow::Kernels::ElementBinary #endif diff --git a/lib/kernels/include/kernels/element_binary_kernels_cpu.h b/lib/kernels/include/kernels/element_binary_kernels_cpu.h new file mode 100644 index 0000000000..c53920764c --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_kernels_cpu.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_CPU_H + +#include "op-attrs/operator_type.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +void cpu_forward_kernel(float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS); + +void cpu_backward_kernel(float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS); + +} // namespace FlexFlow::Kernels::ElementBinary + +#endif diff --git a/lib/kernels/include/kernels/element_binary_kernels_gpu.h b/lib/kernels/include/kernels/element_binary_kernels_gpu.h new file mode 100644 index 0000000000..58a06edb4d --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_kernels_gpu.h @@ -0,0 +1,43 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_BINARY_KERNELS_GPU_H + +#include "kernels/element_binary_per_device_state.dtg.h" +#include "op-attrs/operator_type.h" +#include "op-attrs/tensor_shape.dtg.h" + +namespace FlexFlow::Kernels::ElementBinary { + +ElementBinaryPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape); + +void 
gpu_forward_kernel(ffStream_t stream, + ElementBinaryPerDeviceState const &per_device_state, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + PerDeviceFFHandle handle); + +void gpu_backward_kernel(ffStream_t stream, + ElementBinaryPerDeviceState const &per_device_state, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle); + +void gpu_cleanup_kernel(ElementBinaryPerDeviceState const &per_device_state); + +} // namespace FlexFlow::Kernels::ElementBinary + +#endif diff --git a/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml b/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml new file mode 100644 index 0000000000..2cae58f847 --- /dev/null +++ b/lib/kernels/include/kernels/element_binary_per_device_state.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "ElementBinaryPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputLHSTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "inputRHSTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "opDesc" +type = "ffOpTensorDescriptor_t" + +[[fields]] +name = "reduceAddDesc" +type = "ffReduceTensorDescriptor_t" diff --git a/lib/kernels/include/kernels/element_unary_kernels.h b/lib/kernels/include/kernels/element_unary_kernels.h index 0257b3b4a6..a3fb3a1ae0 100644 --- a/lib/kernels/include/kernels/element_unary_kernels.h +++ b/lib/kernels/include/kernels/element_unary_kernels.h @@ -2,46 +2,42 @@ #define _FLEXFLOW_OPS_KERNELS_ELEMENT_UNARY_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/element_unary_per_device_state.dtg.h" #include "kernels/ff_handle.h" -#include "op-attrs/ops/element_unary.h" -#include - -namespace FlexFlow { - -struct ElementUnaryPerDeviceState { - ffTensorDescriptor_t inputTensor, outputTensor; - req actiDesc; -}; - -FF_VISITABLE_STRUCT_NO_EQ(ElementUnaryPerDeviceState, - inputTensor, - outputTensor, - actiDesc); - -namespace Kernels::ElementUnary { - -ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - ElementUnaryAttrs const &attrs); - -void forward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad); - -} // namespace Kernels::ElementUnary -} // namespace FlexFlow +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +std::optional + init_kernel(DeviceType device_type, + TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs); + +void forward_kernel( + 
device_stream_t const &stream, + std::optional const &device_state, + ElementUnaryAttrs const &attrs, + device_handle_t const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &device_state, + ElementUnaryAttrs const &attrs, + device_handle_t const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +void cleanup_kernel( + DeviceType device_type, + std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::ElementUnary #endif diff --git a/lib/kernels/include/kernels/element_unary_kernels_cpu.h b/lib/kernels/include/kernels/element_unary_kernels_cpu.h new file mode 100644 index 0000000000..55a25411a0 --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "kernels/ff_handle.h" +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +void cpu_forward_kernel(ElementUnaryAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(ElementUnaryAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::ElementUnary + +#endif diff --git a/lib/kernels/include/kernels/element_unary_kernels_gpu.h b/lib/kernels/include/kernels/element_unary_kernels_gpu.h new file mode 100644 index 0000000000..be5eed0edc --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_kernels_gpu.h @@ -0,0 +1,36 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_ELEMENT_UNARY_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "kernels/element_unary_per_device_state.dtg.h" +#include "kernels/ff_handle.h" +#include "op-attrs/ops/element_unary_attrs.dtg.h" + +namespace FlexFlow::Kernels::ElementUnary { + +ElementUnaryPerDeviceState gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs); + +void gpu_forward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &per_device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &per_device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad); + +void gpu_cleanup_kernel(ElementUnaryPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::ElementUnary + +#endif diff --git a/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml b/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml new file mode 100644 index 0000000000..019df40315 --- /dev/null +++ b/lib/kernels/include/kernels/element_unary_per_device_state.struct.toml @@ -0,0 +1,19 @@ 
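+# Spec for the generated element_unary_per_device_state.dtg.h header included
+# by the kernel headers above; it records the tensor and activation
+# descriptors that the element-unary GPU kernels reuse between the forward
+# and backward passes.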
+namespace = "FlexFlow" +name = "ElementUnaryPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", +] + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" diff --git a/lib/kernels/include/kernels/embedding_kernels.h b/lib/kernels/include/kernels/embedding_kernels.h index f51a730314..e9c158598a 100644 --- a/lib/kernels/include/kernels/embedding_kernels.h +++ b/lib/kernels/include/kernels/embedding_kernels.h @@ -2,11 +2,12 @@ #define _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" -#include "op-attrs/ops/embedding.h" +#include "kernels/device_stream_t.dtg.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" namespace FlexFlow::Kernels::Embedding { -void forward_kernel(ffStream_t stream, + +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, GenericTensorAccessorR const &weight, @@ -16,7 +17,8 @@ void forward_kernel(ffStream_t stream, int in_dim, int out_dim, int batch_size); -void backward_kernel(ffStream_t stream, + +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output, GenericTensorAccessorR const &input, GenericTensorAccessorW const &weight_grad, @@ -27,12 +29,6 @@ void backward_kernel(ffStream_t stream, int out_dim, int batch_size); -void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p); -void rand_generate_int32_wrapper(int32_t *ptr, size_t size, int32_t p); - -template -__global__ void rand_generate_int(TD *ptr, size_t size, TD p); - } // namespace FlexFlow::Kernels::Embedding #endif // _FLEXFLOW_OPS_KERNELS_EMBEDDING_KERNELS_H diff --git a/lib/kernels/include/kernels/embedding_kernels_cpu.h b/lib/kernels/include/kernels/embedding_kernels_cpu.h new file mode 100644 index 0000000000..23e32589ae --- /dev/null +++ b/lib/kernels/include/kernels/embedding_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/embedding_attrs.dtg.h" + +namespace FlexFlow::Kernels::Embedding { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +} // namespace FlexFlow::Kernels::Embedding + +#endif diff --git a/lib/kernels/include/kernels/embedding_kernels_gpu.h b/lib/kernels/include/kernels/embedding_kernels_gpu.h new file mode 100644 index 0000000000..7eace3971b --- /dev/null +++ b/lib/kernels/include/kernels/embedding_kernels_gpu.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_EMBEDDING_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/embedding.h" + +namespace FlexFlow::Kernels::Embedding { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, 
+ GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size); + +} // namespace FlexFlow::Kernels::Embedding + +#endif diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h index 31b3296a98..36ed58d91d 100644 --- a/lib/kernels/include/kernels/ff_handle.h +++ b/lib/kernels/include/kernels/ff_handle.h @@ -1,16 +1,16 @@ -#ifndef _FLEXFLOW_KERNELS_FF_HANDLE_H -#define _FLEXFLOW_KERNELS_FF_HANDLE_H +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FF_HANDLE_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FF_HANDLE_H #ifdef FF_USE_NCCL #include #endif #include "kernels/device.h" -#include "utils/visitable.h" namespace FlexFlow { struct PerDeviceFFHandle { +public: ffHandle_t dnn; ffblasHandle_t blas; @@ -23,23 +23,6 @@ struct PerDeviceFFHandle { #endif }; -#ifdef FF_USE_NCCL -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, - dnn, - blas, - workSpace, - workSpaceSize, - allowTensorOpMathConversion, - ncclComm); -#else -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle, - dnn, - blas, - workSpace, - workSpaceSize, - allowTensorOpMathConversion); -#endif - std::string format_as(PerDeviceFFHandle const &x); std::ostream &operator<<(std::ostream &s, PerDeviceFFHandle const &x); diff --git a/lib/kernels/include/kernels/fill_tensor_accessor.h b/lib/kernels/include/kernels/fill_tensor_accessor.h index b10345933f..0e3cfd0dd5 100644 --- a/lib/kernels/include/kernels/fill_tensor_accessor.h +++ b/lib/kernels/include/kernels/fill_tensor_accessor.h @@ -7,7 +7,7 @@ namespace FlexFlow { -void fill_tensor_accessor(GenericTensorAccessorW &, DataTypeValue val); +void fill_with_zeros(GenericTensorAccessorW const &accessor); GenericTensorAccessorW create_accessor_w_filled_with( TensorShape const &shape, DataTypeValue val, Allocator const &allocator); diff --git a/lib/kernels/include/kernels/flat_kernels.h b/lib/kernels/include/kernels/flat_kernels.h index b2b1164f92..caf04ec125 100644 --- a/lib/kernels/include/kernels/flat_kernels.h +++ b/lib/kernels/include/kernels/flat_kernels.h @@ -2,16 +2,16 @@ #define _FLEXFLOW_OPS_KERNELS_FLAT_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Flat { -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR input, +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, float *output_ptr); -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR input, +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, float const *output_grad_ptr, float *input_grad_ptr); diff --git a/lib/kernels/include/kernels/flat_kernels_cpu.h b/lib/kernels/include/kernels/flat_kernels_cpu.h new file mode 100644 index 0000000000..2fe43b0927 --- /dev/null +++ b/lib/kernels/include/kernels/flat_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Flat { + +void 
cpu_forward_kernel(GenericTensorAccessorR const &input, float *output_ptr); + +void cpu_backward_kernel(GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Flat + +#endif diff --git a/lib/kernels/include/kernels/flat_kernels_gpu.h b/lib/kernels/include/kernels/flat_kernels_gpu.h new file mode 100644 index 0000000000..4e889c561c --- /dev/null +++ b/lib/kernels/include/kernels/flat_kernels_gpu.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_FLAT_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Flat { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Flat + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels.h b/lib/kernels/include/kernels/gather_kernels.h index 8cbc7e457e..66c79ab7ac 100644 --- a/lib/kernels/include/kernels/gather_kernels.h +++ b/lib/kernels/include/kernels/gather_kernels.h @@ -1,35 +1,30 @@ #ifndef _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_GATHER_KERNELS_H -#include "accessor.h" -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/gather_per_device_state.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Gather { -struct GatherPerDeviceState { - PerDeviceFFHandle handle; - legion_dim_t legion_dim; -}; +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim); -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GatherPerDeviceState, - handle, - legion_dim); - -namespace Kernels::Gather { - -void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &per_device_state, +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, GenericTensorAccessorR const &input, GenericTensorAccessorR const &index, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &per_device_state, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad); +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad); -} // namespace Kernels::Gather -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Gather #endif diff --git a/lib/kernels/include/kernels/gather_kernels_cpu.h b/lib/kernels/include/kernels/gather_kernels_cpu.h new file mode 100644 index 0000000000..74e8f35d9f --- /dev/null +++ b/lib/kernels/include/kernels/gather_kernels_cpu.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Gather { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + 
GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Gather + +#endif diff --git a/lib/kernels/include/kernels/gather_kernels_gpu.h b/lib/kernels/include/kernels/gather_kernels_gpu.h new file mode 100644 index 0000000000..da0866dbca --- /dev/null +++ b/lib/kernels/include/kernels/gather_kernels_gpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_GATHER_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "kernels/gather_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::Gather { + +GatherPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim); + +void gpu_forward_kernel(ffStream_t stream, + GatherPerDeviceState const &per_device_state, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GatherPerDeviceState const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad); + +} // namespace FlexFlow::Kernels::Gather + +#endif diff --git a/lib/kernels/include/kernels/gather_per_device_state.struct.toml b/lib/kernels/include/kernels/gather_per_device_state.struct.toml new file mode 100644 index 0000000000..c5163f0ddc --- /dev/null +++ b/lib/kernels/include/kernels/gather_per_device_state.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "GatherPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "op-attrs/ff_dim_t.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "dim" +type = "::FlexFlow::ff_dim_t" diff --git a/lib/kernels/include/kernels/layer_norm_kernels.h b/lib/kernels/include/kernels/layer_norm_kernels.h index 10cf2fb14b..7d59e323ba 100644 --- a/lib/kernels/include/kernels/layer_norm_kernels.h +++ b/lib/kernels/include/kernels/layer_norm_kernels.h @@ -2,61 +2,44 @@ #define _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H #include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" - -namespace FlexFlow { - -struct LayerNormPerDeviceState { - PerDeviceFFHandle handle; - bool elementwise_affine; - int64_t effective_batch_size, effective_num_elements; - float eps; - float *mean, *rstd, *ds, *db, *scale, *bias; - DataType data_type; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LayerNormPerDeviceState, - handle, - elementwise_affine, - effective_batch_size, - effective_num_elements, - eps, - mean, - rstd, - ds, - db, - scale, - bias, - data_type); - -namespace Kernels::LayerNorm { - -// todo: this may have some problem. 
-LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator &allocator, - bool elementwise_affine, - int64_t effective_batch_size, - int64_t effective_num_elements, - float eps); - -void forward_kernel(ffStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &gamma, - GenericTensorAccessorW const &beta); - -void backward_kernel(ffStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorW const &gamma_grad, - GenericTensorAccessorW const &beta_grad); - -} // namespace Kernels::LayerNorm -} // namespace FlexFlow +#include "kernels/layer_norm_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::LayerNorm { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + bool elementwise_affine, + int64_t effective_batch_size, + int64_t effective_num_elements, + float eps); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta); + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + +void cleanup_kernel( + DeviceType device_type, + std::optional const &per_device_state); + +} // namespace FlexFlow::Kernels::LayerNorm #endif // _FLEXFLOW_OPS_KERNELS_LAYER_NORM_KERNELS_H diff --git a/lib/kernels/include/kernels/layer_norm_kernels_cpu.h b/lib/kernels/include/kernels/layer_norm_kernels_cpu.h new file mode 100644 index 0000000000..74239a36eb --- /dev/null +++ b/lib/kernels/include/kernels/layer_norm_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::LayerNorm { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta); + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + +} // namespace FlexFlow::Kernels::LayerNorm + +#endif diff --git a/lib/kernels/include/kernels/layer_norm_kernels_gpu.h b/lib/kernels/include/kernels/layer_norm_kernels_gpu.h new file mode 100644 index 0000000000..ccf6d3027c --- /dev/null +++ b/lib/kernels/include/kernels/layer_norm_kernels_gpu.h @@ -0,0 +1,39 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LAYER_NORM_KERNELS_GPU_H + +#include "kernels/allocation.h" +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/layer_norm_per_device_state.dtg.h" + +namespace FlexFlow::Kernels::LayerNorm { + 
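+// Rough usage sketch (illustrative only; names like `state`, `b`, and `n`
+// are not from this header):
+//
+//   LayerNormPerDeviceState state = gpu_init_kernel(
+//       handle, allocator, /*elementwise_affine=*/true,
+//       /*effective_batch_size=*/b, /*effective_num_elements=*/n,
+//       /*eps=*/1e-5f);
+//   gpu_forward_kernel(stream, state, input, output, gamma, beta);
+//   gpu_backward_kernel(stream, state, output_grad, input, input_grad,
+//                       gamma, gamma_grad, beta_grad);
+//   gpu_cleanup_kernel(state);
+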
+// TODO: this may have some problems.
+LayerNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle,
+                                        Allocator &allocator,
+                                        bool elementwise_affine,
+                                        int64_t effective_batch_size,
+                                        int64_t effective_num_elements,
+                                        float eps);
+
+void gpu_forward_kernel(ffStream_t stream,
+                        LayerNormPerDeviceState const &per_device_state,
+                        GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorW const &gamma,
+                        GenericTensorAccessorW const &beta);
+
+void gpu_backward_kernel(ffStream_t stream,
+                         LayerNormPerDeviceState const &per_device_state,
+                         GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad,
+                         GenericTensorAccessorR const &gamma,
+                         GenericTensorAccessorW const &gamma_grad,
+                         GenericTensorAccessorW const &beta_grad);
+
+void gpu_cleanup_kernel(LayerNormPerDeviceState const &per_device_state);
+
+} // namespace FlexFlow::Kernels::LayerNorm
+
+#endif
diff --git a/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml b/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml
new file mode 100644
index 0000000000..0a482d5395
--- /dev/null
+++ b/lib/kernels/include/kernels/layer_norm_per_device_state.struct.toml
@@ -0,0 +1,57 @@
+namespace = "FlexFlow"
+name = "LayerNormPerDeviceState"
+features = []
+
+includes = [
+  "kernels/ff_handle.h",
+  "op-attrs/datatype.dtg.h",
+]
+
+[[fields]]
+name = "handle"
+type = "::FlexFlow::PerDeviceFFHandle"
+
+[[fields]]
+name = "elementwise_affine"
+type = "bool"
+
+[[fields]]
+name = "effective_num_elements"
+type = "int64_t"
+
+[[fields]]
+name = "effective_batch_size"
+type = "int64_t"
+
+[[fields]]
+name = "eps"
+type = "float"
+
+[[fields]]
+name = "mean"
+type = "float *"
+
+[[fields]]
+name = "rstd"
+type = "float *"
+
+[[fields]]
+name = "ds"
+type = "float *"
+
+[[fields]]
+name = "db"
+type = "float *"
+
+[[fields]]
+name = "scale"
+type = "float *"
+
+[[fields]]
+name = "bias"
+type = "float *"
+
+[[fields]]
+name = "data_type"
+type = "::FlexFlow::DataType"
+
diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h
index 796423102b..24eff46e22 100644
--- a/lib/kernels/include/kernels/legion_dim.h
+++ b/lib/kernels/include/kernels/legion_dim.h
@@ -5,6 +5,7 @@
 #include "kernels/legion_ordered/legion_ordered.h"
 #include "op-attrs/ff_dim_t.dtg.h"
 #include "op-attrs/ff_ordered/ff_ordered.h"
+#include "op-attrs/tensor_dims.dtg.h"
 #include "utils/containers/set_of.h"
 #include "utils/containers/transform.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
@@ -13,6 +14,9 @@
 namespace FlexFlow {
 
+positive_int dim_at_idx(TensorDims const &, legion_dim_t);
+positive_int &dim_at_idx(TensorDims &, legion_dim_t);
+
 legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value);
 
 legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions);
diff --git a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
index ad8b3bad6d..87836fb31e 100644
--- a/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
+++ b/lib/kernels/include/kernels/legion_ordered/legion_ordered.h
@@ -11,17 +11,17 @@ template 
 struct LegionOrdered {
   LegionOrdered() {}
 
-  LegionOrdered(std::initializer_list const &l)
+  explicit LegionOrdered(std::initializer_list const &l)
       : contents(l.begin(), l.end()) {}
 
-  LegionOrdered(std::vector const &contents)
+  explicit LegionOrdered(std::vector const &contents)
      :
contents(contents.begin(), contents.end()) {} template - LegionOrdered(It begin, It end) : contents(begin, end) {} + explicit LegionOrdered(It begin, It end) : contents(begin, end) {} template - LegionOrdered(stack_vector const &contents) + explicit LegionOrdered(stack_vector const &contents) : contents(contents.begin(), contents.end()) {} T const &at(legion_dim_t idx) const { diff --git a/lib/kernels/include/kernels/linear_kernels.h b/lib/kernels/include/kernels/linear_kernels.h index 21d84c2567..0b6371c766 100644 --- a/lib/kernels/include/kernels/linear_kernels.h +++ b/lib/kernels/include/kernels/linear_kernels.h @@ -1,77 +1,54 @@ #ifndef _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_LINEAR_KERNELS_H -#include "ff_handle.h" -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/linear_per_device_state.dtg.h" #include "op-attrs/datatype.h" #include "op-attrs/ops/linear_attrs.dtg.h" +#include "pcg/device_type.dtg.h" namespace FlexFlow { -struct LinearPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t outputTensor; - ffActivationDescriptor_t actiDesc; - float const *one_ptr; // how to handle this? - cudnnActivationMode_t activation_mode; - std::optional activation; - std::optional regularizer; - bool use_bias; - DataType input_type, weight_type, output_type; -}; +std::optional + linear_init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int output_num_channels); + +void linear_forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW const &output_accessor, + GenericTensorAccessorR const &projection_accessor, + std::optional const &bias_accessor); + +void linear_backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &projection, + GenericTensorAccessorW const &projection_grad, + std::optional const &bias_grad); + +void linear_cleanup_kernel( + DeviceType device_type, + std::optional &per_device_state); -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(LinearPerDeviceState, - handle, - outputTensor, - actiDesc, - one_ptr, - activation_mode, - activation, - regularizer, - use_bias, - input_type, - weight_type, - output_type); - -namespace Kernels::Linear { - -LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, - float *one_ptr, - std::optional activation, - std::optional regularizer, - bool use_bias, - DataType input_type, - DataType weight_type, - DataType output_type, - int batch_size, - int channel); - -bool use_activation(Activation activation); - -void forward_kernel(ffStream_t stream, - LinearPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *filter_ptr, - float const *bias_ptr, - int in_dim, - int out_dim, - int batch_size); - -void backward_kernel(ffStream_t stream, - LinearPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - 
float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr, - int in_dim, - int out_dim, - int batch_size); - -} // namespace Kernels::Linear } // namespace FlexFlow #endif diff --git a/lib/kernels/include/kernels/linear_kernels_cpu.h b/lib/kernels/include/kernels/linear_kernels_cpu.h new file mode 100644 index 0000000000..4621f38d7f --- /dev/null +++ b/lib/kernels/include/kernels/linear_kernels_cpu.h @@ -0,0 +1,29 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/linear_attrs.dtg.h" +#include + +namespace FlexFlow { + +void linear_cpu_forward_kernel( + LinearAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &projection, + std::optional const &bias); + +void linear_cpu_backward_kernel( + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &projection, + GenericTensorAccessorW const &projection_grad, + std::optional const &bias_grad); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/linear_kernels_gpu.h b/lib/kernels/include/kernels/linear_kernels_gpu.h new file mode 100644 index 0000000000..02fac75c25 --- /dev/null +++ b/lib/kernels/include/kernels/linear_kernels_gpu.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LINEAR_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/linear_per_device_state.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Linear { + +LinearPerDeviceState + gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int output_num_channels); + +void gpu_forward_kernel(ffStream_t stream, + LinearPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); + +void gpu_backward_kernel(ffStream_t stream, + LinearPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, + int in_dim, + int out_dim, + int batch_size); + +void gpu_cleanup_kernel(LinearPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Linear + +#endif diff --git a/lib/kernels/include/kernels/linear_per_device_state.struct.toml b/lib/kernels/include/kernels/linear_per_device_state.struct.toml new file mode 100644 index 0000000000..3ed534a23f --- /dev/null +++ b/lib/kernels/include/kernels/linear_per_device_state.struct.toml @@ -0,0 +1,56 @@ +namespace = "FlexFlow" +name = "LinearPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", + "", + "op-attrs/activation.dtg.h", + "op-attrs/regularizer_attrs.dtg.h", + "op-attrs/datatype.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = 
"ffActivationDescriptor_t" + +[[fields]] +name = "one_ptr" +type = "float const *" + +[[fields]] +name = "activation_mode" +type = "cudnnActivationMode_t" + +[[fields]] +name = "activation" +type = "std::optional<::FlexFlow::Activation>" + +[[fields]] +name = "regularizer" +type = "std::optional<::FlexFlow::RegularizerAttrs>" + +[[fields]] +name = "use_bias" +type = "bool" + +[[fields]] +name = "input_type" +type = "::FlexFlow::DataType" + +[[fields]] +name = "weight_type" +type = "::FlexFlow::DataType" + +[[fields]] +name = "output_type" +type = "::FlexFlow::DataType" diff --git a/lib/kernels/include/kernels/loss_function_kernels.h b/lib/kernels/include/kernels/loss_function_kernels.h index bab404f884..092d3691f5 100644 --- a/lib/kernels/include/kernels/loss_function_kernels.h +++ b/lib/kernels/include/kernels/loss_function_kernels.h @@ -1,12 +1,13 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_H -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow { void sparse_categorical_crossentropy_loss_backward_kernel( - ffStream_t stream, + device_stream_t const &stream, float *logit_grad_ptr, float const *logit_ptr, int const *label_ptr, @@ -16,21 +17,23 @@ void sparse_categorical_crossentropy_loss_backward_kernel( int num_classes, int k, float scale_factor); -void categorical_crossentropy_loss_backward_kernel(ffStream_t stream, - float *logit_grad_ptr, - float const *logit_ptr, - float const *label_ptr, - size_t logit_volume, - size_t logit_grad_volume, - float scale_factor); -void mean_squared_error_avg_loss_backward_kernel(ffStream_t stream, + +void categorical_crossentropy_loss_backward_kernel( + device_stream_t const &stream, + GenericTensorAccessorW const &logit_grad, + GenericTensorAccessorR const &logit, + GenericTensorAccessorR const &label, + float scale_factor); + +void mean_squared_error_avg_loss_backward_kernel(device_stream_t const &stream, float *logit_grad_ptr, float const *logit_ptr, float const *label_ptr, size_t logit_volume, size_t logit_grad_volume, float scale_factor); -void identity_loss_backward_kernel(ffStream_t stream, + +void identity_loss_backward_kernel(device_stream_t const &stream, float *loss_grad_ptr, float const *loss_ptr, size_t loss_volume, diff --git a/lib/kernels/include/kernels/loss_function_kernels_cpu.h b/lib/kernels/include/kernels/loss_function_kernels_cpu.h new file mode 100644 index 0000000000..b6abd01ab3 --- /dev/null +++ b/lib/kernels/include/kernels/loss_function_kernels_cpu.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include + +namespace FlexFlow { + +void sparse_categorical_crossentropy_loss_backward_cpu_kernel( + float *logit_grad_ptr, + float const *logit_ptr, + int const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + int num_samples, + int num_classes, + int k, + float scale_factor); + +void categorical_crossentropy_loss_backward_cpu_kernel( + GenericTensorAccessorW const &logit_grad_ptr, + GenericTensorAccessorR const &logit_ptr, + GenericTensorAccessorR const &label_ptr, + float scale_factor); + +void mean_squared_error_avg_loss_backward_cpu_kernel(float *logit_grad_ptr, + float const *logit_ptr, + float const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + float scale_factor); + +void 
identity_loss_backward_cpu_kernel(float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/loss_function_kernels_gpu.h b/lib/kernels/include/kernels/loss_function_kernels_gpu.h
new file mode 100644
index 0000000000..7bda92531f
--- /dev/null
+++ b/lib/kernels/include/kernels/loss_function_kernels_gpu.h
@@ -0,0 +1,45 @@
+#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_GPU_H
+#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_LOSS_FUNCTION_KERNELS_GPU_H
+
+#include "kernels/device.h"
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_gpu_kernel(
+    ffStream_t stream,
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor);
+
+void categorical_crossentropy_loss_backward_gpu_kernel(ffStream_t stream,
+                                                       float *logit_grad_ptr,
+                                                       float const *logit_ptr,
+                                                       float const *label_ptr,
+                                                       size_t logit_volume,
+                                                       size_t logit_grad_volume,
+                                                       float scale_factor);
+
+void mean_squared_error_avg_loss_backward_gpu_kernel(ffStream_t stream,
+                                                     float *logit_grad_ptr,
+                                                     float const *logit_ptr,
+                                                     float const *label_ptr,
+                                                     size_t logit_volume,
+                                                     size_t logit_grad_volume,
+                                                     float scale_factor);
+
+void identity_loss_backward_gpu_kernel(ffStream_t stream,
+                                       float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/kernels/include/kernels/managed_per_device_ff_handle.h b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
index 287369a202..e80e3e4b15 100644
--- a/lib/kernels/include/kernels/managed_per_device_ff_handle.h
+++ b/lib/kernels/include/kernels/managed_per_device_ff_handle.h
@@ -2,6 +2,8 @@
 #define _FLEXFLOW_KERNELS_MANAGED_HANDLE_H
 
 #include "kernels/ff_handle.h"
+#include "pcg/device_type.dtg.h"
+#include <optional>
 
 namespace FlexFlow {
 
@@ -33,6 +35,11 @@ struct ManagedPerDeviceFFHandle {
   PerDeviceFFHandle *handle;
 };
 
+std::optional<ManagedPerDeviceFFHandle>
+    create_local_handle_for_device_type(DeviceType device_type,
+                                        size_t workSpaceSize,
+                                        bool allowTensorOpMathConversion);
+
 ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize,
                                                       bool allowTensorOpMathConversion);
 
diff --git a/lib/kernels/include/kernels/map_tensor_accessors.h b/lib/kernels/include/kernels/map_tensor_accessors.h
index 2933a611cf..f7aa6a1001 100644
--- a/lib/kernels/include/kernels/map_tensor_accessors.h
+++ b/lib/kernels/include/kernels/map_tensor_accessors.h
@@ -15,57 +15,78 @@ namespace FlexFlow {
 
 template 
 struct CPUMapTensorAccessorInPlace {
   template 
-  void operator()(GenericTensorAccessorW &accessor, F &&f) {
+  void operator()(GenericTensorAccessorW const &accessor, F &&f) {
     ASSERT(accessor.device_type == DeviceType::CPU);
 
-    for (ArrayCoord const &coord : get_array_coord_set(accessor.shape)) {
-      accessor.at
(coord.ff_ordered) = f(accessor.at
(coord.ff_ordered)); + for (TensorDimsCoord const &coord : + get_tensor_dims_coord_set(accessor.shape.dims)) { + accessor.at
(coord) = f(accessor.at
(coord)); } } }; template -void map_tensor_accessor_inplace(GenericTensorAccessorW &accessor, F &&f) { +void map_tensor_accessor_inplace(GenericTensorAccessorW const &accessor, + F &&f) { ASSERT(accessor.device_type == DeviceType::CPU); DataTypeDispatch1{}( - accessor.data_type, accessor, f); + accessor.shape.data_type, accessor, f); } template struct CPUMapTensorAccessor { template void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, + GenericTensorAccessorW const &output, F &&f) { - ArrayShape shape = require_same(input.shape, output.shape); + TensorDims tensor_dims = require_same(input.shape.dims, output.shape.dims); ASSERT(input.device_type == DeviceType::CPU); ASSERT(output.device_type == DeviceType::CPU); - for (ArrayCoord const &coord : get_array_coord_set(shape)) { + for (TensorDimsCoord const &coord : + get_tensor_dims_coord_set(tensor_dims)) { output.at< type_to_data_type_enum_v>>>( - coord.ff_ordered) = f(input.at
(coord.ff_ordered)); + coord) = f(input.at
(coord)); } } }; template > -GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, - F &&f, - Allocator &output_allocator) { +void map_tensor_accessor_to(GenericTensorAccessorR const &input, + F &&f, + GenericTensorAccessorW const &output) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR input_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( - get_tensor_shape(input.shape, type_to_data_type_enum_v)); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output.shape); DataTypeDispatch1{}( - input.data_type, input_cpu, output_cpu, f); + input.shape.data_type, input_cpu, output_cpu, f); - return copy_tensor_accessor_w(output_cpu, output_allocator); + copy_accessor_data_to_l_from_r( + output, read_only_accessor_from_write_accessor(output_cpu)); +} + +template > +GenericTensorAccessorW map_tensor_accessor(GenericTensorAccessorR const &input, + F &&f, + Allocator &output_allocator) { + TensorShape output_shape = TensorShape{ + /*dims=*/input.shape.dims, + /*data_type=*/type_to_data_type_enum_v, + }; + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + map_tensor_accessor_to(input, f, output); + + return output; } template @@ -78,30 +99,30 @@ struct CPUMapTensorAccessors2 { GenericTensorAccessorW &output, F &&f) { - ArrayShape shape = throw_if_unexpected(require_all_same1(std::vector{ - lhs.shape, - rhs.shape, - output.shape, + TensorDims dims = throw_if_unexpected(require_all_same1(std::vector{ + lhs.shape.dims, + rhs.shape.dims, + output.shape.dims, })); ASSERT(lhs.device_type == DeviceType::CPU); ASSERT(rhs.device_type == DeviceType::CPU); ASSERT(output.device_type == DeviceType::CPU); - for (ArrayCoord const &coord : get_array_coord_set(shape)) { - output.at>(coord.ff_ordered) = - f(lhs.at(coord.ff_ordered), rhs.at(coord.ff_ordered)); + for (TensorDimsCoord const &coord : get_tensor_dims_coord_set(dims)) { + output.at>(coord) = + f(lhs.at(coord), rhs.at(coord)); } } }; template -GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, - GenericTensorAccessorR const &rhs, - DataType output_data_type, - F &&f, - Allocator &output_allocator) { - ArrayShape shape = require_same(lhs.shape, rhs.shape); +void map_tensor_accessors2_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + DataType output_data_type, + F &&f, + GenericTensorAccessorW const &output) { + TensorDims output_dims = require_same(lhs.shape.dims, rhs.shape.dims); Allocator cpu_allocator = create_local_cpu_memory_allocator(); GenericTensorAccessorR lhs_cpu = @@ -109,12 +130,32 @@ GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, GenericTensorAccessorR rhs_cpu = copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); GenericTensorAccessorW output_cpu = - cpu_allocator.allocate_tensor(get_tensor_shape(shape, output_data_type)); + cpu_allocator.allocate_tensor(TensorShape{output_dims, output_data_type}); + + DataTypeDispatch2{}(lhs.shape.data_type, + rhs.shape.data_type, + lhs_cpu, + rhs_cpu, + output_cpu, + f); + + return copy_accessor_data_to_l_from_r(output, output_cpu); +} + +template +GenericTensorAccessorW map_tensor_accessors2(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + DataType output_data_type, + F &&f, + Allocator &output_allocator) { + TensorDims output_dims = require_same(lhs.shape.dims, rhs.shape.dims); 
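+  // Allocate the output on the target device, then delegate to
+  // map_tensor_accessors2_to above, which stages lhs/rhs through CPU memory,
+  // applies f element-wise, and copies the result back into `output`.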
+ + GenericTensorAccessorW output = output_allocator.allocate_tensor( + TensorShape{output_dims, output_data_type}); - DataTypeDispatch2{}( - lhs.data_type, rhs.data_type, lhs_cpu, rhs_cpu, output_cpu, f); + map_tensor_accessors2_to(lhs, rhs, output_data_type, f, output); - return copy_tensor_accessor_w(output_cpu, output_allocator); + return output; } } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/mha_per_device_state.struct.toml b/lib/kernels/include/kernels/mha_per_device_state.struct.toml new file mode 100644 index 0000000000..324e8d1184 --- /dev/null +++ b/lib/kernels/include/kernels/mha_per_device_state.struct.toml @@ -0,0 +1,65 @@ +namespace = "FlexFlow" +name = "MHAPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", + "kernels/allocation.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "weightSize" +type = "size_t" + +[[fields]] +name = "reserveSpaceSize" +type = "size_t" + +[[fields]] +name = "attnDesc" +type = "ffAttnDescriptor_t" + +[[fields]] +name = "qDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "kDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "vDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "oDesc" +type = "ffSeqDataDescriptor_t" + +[[fields]] +name = "devQoSeqArray" +type = "int *" + +[[fields]] +name = "devKvSeqArray" +type = "int *" + +[[fields]] +name = "loWinIdx" +type = "int *" + +[[fields]] +name = "hiWinIdx" +type = "int *" + +[[fields]] +name = "reserveSpace" +type = "void *" + +[[fields]] +name = "allocator" +type = "::FlexFlow::Allocator" diff --git a/lib/kernels/include/kernels/optimizer_kernels.h b/lib/kernels/include/kernels/optimizer_kernels.h index 51e6f8640f..6bb7b913be 100644 --- a/lib/kernels/include/kernels/optimizer_kernels.h +++ b/lib/kernels/include/kernels/optimizer_kernels.h @@ -1,62 +1,37 @@ #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H -#include "kernels/device.h" +#include "kernels/accessor.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" namespace FlexFlow { -void sgd_ps_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *sgd_v_ptr); - -#ifdef FF_USE_NCCL -void sgd_nccl_update_task_gpu(ffStream_t, - float lr, - float momentum, - bool nesterov, - float weight_decay, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *sgd_v_ptr); -#endif - -void adam_ps_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - float const *weight_grad_ptr, - size_t size, - int num_replicas, - float *weight_ptr, - float *adam_v_ptr, - float *adam_m_ptr); - -#ifdef FF_USE_NCCL -void adam_nccl_update_task_gpu(ffStream_t, - float alpha_t, - float beta1, - float beta2, - float weight_decay, - float epsilon, - PerDeviceFFHandle const &, - float const *weight_grad_ptr, - size_t size, - float *weight_ptr, - float *adam_v_ptr, - float *adam_m_ptr); -#endif +void sgd_update_task(device_stream_t const &stream, + device_handle_t const &handle, + float lr, + float momentum, + bool nesterov, + float weight_decay, + GenericTensorAccessorR const &weight_grad, + int num_replicas, + GenericTensorAccessorW const &weight, + std::optional const 
&sgd_v); + +void adam_update_task(device_stream_t const &stream, + device_handle_t const &handle, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/optimizer_kernels_cpu.h b/lib/kernels/include/kernels/optimizer_kernels_cpu.h new file mode 100644 index 0000000000..1a7943f9ca --- /dev/null +++ b/lib/kernels/include/kernels/optimizer_kernels_cpu.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include + +namespace FlexFlow { + +void cpu_sgd_update_task(float lr, + float momentum, + bool nesterov, + float weight_decay, + GenericTensorAccessorR const &weight_grad, + GenericTensorAccessorW const &weight, + std::optional const &sgd_v); + +void cpu_adam_update_task(float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/optimizer_kernels_gpu.h b/lib/kernels/include/kernels/optimizer_kernels_gpu.h new file mode 100644 index 0000000000..3e2a65a638 --- /dev/null +++ b/lib/kernels/include/kernels/optimizer_kernels_gpu.h @@ -0,0 +1,59 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" + +namespace FlexFlow { + +void gpu_sgd_ps_update_task(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *sgd_v_ptr); + +void gpu_sgd_nccl_update_task(ffStream_t stream, + float lr, + float momentum, + bool nesterov, + float weight_decay, + PerDeviceFFHandle const &, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *sgd_v_ptr); + +void gpu_adam_ps_update_task(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + float const *weight_grad_ptr, + size_t size, + int num_replicas, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +void gpu_adam_nccl_update_task(ffStream_t stream, + float alpha_t, + float beta1, + float beta2, + float weight_decay, + float epsilon, + PerDeviceFFHandle const &handle, + float const *weight_grad_ptr, + size_t size, + float *weight_ptr, + float *adam_v_ptr, + float *adam_m_ptr); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/partition_kernels.h b/lib/kernels/include/kernels/partition_kernels.h deleted file mode 100644 index aa3a7a1ef7..0000000000 --- a/lib/kernels/include/kernels/partition_kernels.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow { - -struct RepartitionPerDeviceState { - PerDeviceFFHandle handle; - req data_type; -}; - -FF_VISITABLE_STRUCT_NO_EQ(RepartitionPerDeviceState, handle, data_type); - -namespace Kernels::Repartition { - -RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - 
DataType data_type); - -void forward_kernel(ffStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad); - -} // namespace Kernels::Repartition -} // namespace FlexFlow - -#endif // _FLEXFLOW_OPS_KERNELS_PARTITION_KERNELS_H diff --git a/lib/kernels/include/kernels/partition_per_device_state.struct.toml b/lib/kernels/include/kernels/partition_per_device_state.struct.toml new file mode 100644 index 0000000000..a008e422cd --- /dev/null +++ b/lib/kernels/include/kernels/partition_per_device_state.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "RepartitionPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "op-attrs/datatype.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "data_type" +type = "::FlexFlow::DataType" diff --git a/lib/kernels/include/kernels/per_device_op_state.variant.toml b/lib/kernels/include/kernels/per_device_op_state.variant.toml deleted file mode 100644 index 0171e3e497..0000000000 --- a/lib/kernels/include/kernels/per_device_op_state.variant.toml +++ /dev/null @@ -1,82 +0,0 @@ -namespace = "FlexFlow" -name = "PerDeviceOpState" -features = [] - -includes = [ - "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", - "kernels/conv_2d_kernels.h", - "kernels/dropout_kernels.h", - "kernels/element_binary_kernels.h", - "kernels/element_unary_kernels.h", - "kernels/gather_kernels.h", - "kernels/layer_norm_kernels.h", - "kernels/linear_kernels.h", - "kernels/partition_kernels.h", - "kernels/pool_2d_kernels.h", - "kernels/reduce_kernels.h", - "kernels/reduction_kernels.h", - "kernels/reshape_kernels.h", - "kernels/softmax_kernels.h", - "kernels/topk_kernels.h", -] - -[[values]] -type = "::FlexFlow::MHAPerDeviceState" -key = "mha_per_device_state" - -[[values]] -type = "::FlexFlow::BatchNormPerDeviceState" -key = "batch_norm_per_device_state" - -[[values]] -type = "::FlexFlow::Conv2DPerDeviceState" -key = "conv2d_per_device_state" - -[[values]] -type = "::FlexFlow::DropoutPerDeviceState" -key = "dropout_per_device_state" - -[[values]] -type = "::FlexFlow::ElementBinaryPerDeviceState" -key = "element_binary_per_device_state" - -[[values]] -type = "::FlexFlow::ElementUnaryPerDeviceState" -key = "element_unary_per_device_state" - -[[values]] -type = "::FlexFlow::GatherPerDeviceState" -key = "gather_per_device_state" - -[[values]] -type = "::FlexFlow::LayerNormPerDeviceState" -key = "layer_norm_per_device_state" - -[[values]] -type = "::FlexFlow::LinearPerDeviceState" -key = "linear_per_device_state" - -[[values]] -type = "::FlexFlow::Pool2DPerDeviceState" -key = "pool_2d_per_device_state" - -[[values]] -type = "::FlexFlow::ReducePerDeviceState" -key = "reduce_per_device_state" - -[[values]] -type = "::FlexFlow::RepartitionPerDeviceState" -key = "repartition_per_device_state" - -[[values]] -type = "::FlexFlow::ReshapePerDeviceState" -key = "reshape_per_device_state" - -[[values]] -type = "::FlexFlow::SoftmaxPerDeviceState" -key = "softmax_per_device_state" - -[[values]] -type = "::FlexFlow::TopKPerDeviceState" -key = "topk_per_device_state" diff --git a/lib/kernels/include/kernels/pool_2d_kernels.h b/lib/kernels/include/kernels/pool_2d_kernels.h index 76aa07d0a4..c18ff92289 100644 --- a/lib/kernels/include/kernels/pool_2d_kernels.h +++ 
b/lib/kernels/include/kernels/pool_2d_kernels.h @@ -1,80 +1,52 @@ #ifndef _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" +#include "kernels/pool_2d_per_device_state.dtg.h" #include "op-attrs/activation.dtg.h" #include "op-attrs/ops/pool_2d.h" -#include "utils/visitable.h" - -namespace FlexFlow { - -struct Pool2DPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor, outputTensor; - ffActivationDescriptor_t actiDesc; - ffPoolingDescriptor_t poolDesc; - bool relu; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(Pool2DPerDeviceState, - handle, - inputTensor, - outputTensor, - actiDesc, - poolDesc, - relu); - -namespace Kernels::Pool2D { - -Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type); - -void init_kernel(Pool2DPerDeviceState *m, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type); - -void forward_kernel(ffStream_t stream, - Pool2DPerDeviceState const &m, +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Pool2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type); + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, void const *input_ptr, void *output_ptr); -void backward_kernel(ffStream_t stream, - Pool2DPerDeviceState const &m, - void const *output_ptr, - void const *output_grad_ptr, - void const *input_ptr, - void *input_grad_ptr); +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state); -} // namespace Kernels::Pool2D -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Pool2D #endif // _FLEXFLOW_OPS_KERNELS_POOL_2D_KERNELS_H diff --git a/lib/kernels/include/kernels/pool_2d_kernels_cpu.h b/lib/kernels/include/kernels/pool_2d_kernels_cpu.h new file mode 100644 index 0000000000..aa13e913b6 --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_kernels_cpu.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Pool2D { + +void cpu_forward_kernel(void const *input_ptr, void *output_ptr); + +void cpu_backward_kernel(void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Pool2D + +#endif diff --git a/lib/kernels/include/kernels/pool_2d_kernels_gpu.h b/lib/kernels/include/kernels/pool_2d_kernels_gpu.h new file mode 100644 index 
0000000000..8a1499e97e --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_kernels_gpu.h @@ -0,0 +1,46 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_POOL_2D_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/pool_2d_per_device_state.dtg.h" +#include "op-attrs/activation.dtg.h" +#include "op-attrs/ops/pool_2d.h" + +namespace FlexFlow::Kernels::Pool2D { + +Pool2DPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type); + +void gpu_forward_kernel(ffStream_t stream, + Pool2DPerDeviceState const &per_device_state, + void const *input_ptr, + void *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + Pool2DPerDeviceState const &per_device_state, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr); + +void gpu_cleanup_kernel(Pool2DPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Pool2D + +#endif diff --git a/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml b/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml new file mode 100644 index 0000000000..63e98cca85 --- /dev/null +++ b/lib/kernels/include/kernels/pool_2d_per_device_state.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "Pool2DPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" + +[[fields]] +name = "poolDesc" +type = "ffPoolingDescriptor_t" + +[[fields]] +name = "relu" +type = "bool" diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h index 7c4145c426..c0a0e794e3 100644 --- a/lib/kernels/include/kernels/profiling.h +++ b/lib/kernels/include/kernels/profiling.h @@ -2,21 +2,26 @@ #define _FLEXFLOW_KERNELS_PROFILING_H #include "kernels/device.h" +#include "kernels/device_stream_t.h" #include "kernels/profiling_settings.dtg.h" -#include "utils/visitable.h" +#include "pcg/device_type.dtg.h" +#include namespace FlexFlow { template -std::optional - profiling_wrapper(F const &f, bool enable_profiling, Ts &&...ts) { +std::optional profiling_wrapper(F const &f, + bool enable_profiling, + DeviceType device_type, + Ts &&...ts) { if (enable_profiling) { - ProfilingSettings settings = {0, 1}; + ProfilingSettings settings = ProfilingSettings{ + /*warmup_iters=*/0, + /*measure_iters=*/1, + }; return profiling_wrapper(f, settings, std::forward(ts)...); } else { - ffStream_t stream; - checkCUDA(get_legion_stream(&stream)); - f(stream, std::forward(ts)...); + f(get_stream_for_device_type(device_type), std::forward(ts)...); return std::nullopt; } } @@ -24,9 +29,54 @@ std::optional template std::optional profiling_wrapper(F const &f, ProfilingSettings const &settings, + DeviceType device_type, Ts &&...ts) { - ffStream_t stream; - checkCUDA(get_legion_stream(&stream)); + if (settings.measure_iters <= 0) { + return std::nullopt; + } + + if (device_type == DeviceType::GPU) { + return 
gpu_profiling_wrapper(f, settings, std::forward(ts)...); + } else { + ASSERT(device_type == DeviceType::CPU); + return cpu_profiling_wrapper(f, settings, std::forward(ts)...); + } +} + +template +float cpu_profiling_wrapper(F const &f, + ProfilingSettings const &settings, + Ts &&...ts) { + ASSERT(settings.measure_iters > 0); + + device_stream_t stream = get_cpu_device_stream(); + + using TimePoint = std::chrono::time_point; + + std::optional start = std::nullopt; + std::optional end = std::nullopt; + + for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { + if (i == settings.warmup_iters) { + start = std::chrono::steady_clock::now(); + } + f(stream, std::forward(ts)...); + } + end = std::chrono::steady_clock::now(); + + std::chrono::duration avg_duration = + (end.value() - start.value()) / settings.measure_iters; + + return avg_duration.count(); +} + +template +float gpu_profiling_wrapper(F const &f, + ProfilingSettings const &settings, + Ts &&...ts) { + ASSERT(settings.measure_iters > 0); + + device_stream_t stream = get_gpu_device_stream(); ffEvent_t t_start, t_end; checkCUDA(ffEventCreate(&t_start)); @@ -34,18 +84,18 @@ std::optional profiling_wrapper(F const &f, for (int i = 0; i < settings.warmup_iters + settings.measure_iters; i++) { if (i == settings.warmup_iters) { - checkCUDA(ffEventRecord(t_start, stream)); + checkCUDA(ffEventRecord(t_start, stream.require_gpu())); } f(stream, std::forward(ts)...); } float elapsed = 0; - checkCUDA(ffEventRecord(t_end, stream)); + checkCUDA(ffEventRecord(t_end, stream.require_gpu())); checkCUDA(ffEventSynchronize(t_end)); checkCUDA(ffEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(ffEventDestroy(t_start)); checkCUDA(ffEventDestroy(t_end)); - return elapsed; + return elapsed / settings.measure_iters; } } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/reduce_kernels.h b/lib/kernels/include/kernels/reduce_kernels.h index 10e8e4393b..c890ab35c3 100644 --- a/lib/kernels/include/kernels/reduce_kernels.h +++ b/lib/kernels/include/kernels/reduce_kernels.h @@ -1,48 +1,35 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H -#include "kernels/array_shape.h" -#include "kernels/device.h" +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" #include "kernels/ff_handle.h" +#include "kernels/reduce_per_device_state.dtg.h" #include "op-attrs/operator_type.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" +#include "pcg/device_type.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Reduce { -struct ReducePerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffReduceTensorDescriptor_t reduceDesc; - OperatorType op_type; - req reduction_size; -}; +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType const &operator_type, + size_t const &reduction_size, + TensorShape const &input_shape, + TensorShape const &output_shape); -FF_VISITABLE_STRUCT(ReducePerDeviceState, - handle, - inputTensor, - outputTensor, - reduceDesc, - op_type, - reduction_size); - -namespace Kernels::Reduce { - -ReducePerDeviceState init_kernel(PerDeviceFFHandle const &, - OperatorType const &, - size_t const &, - ArrayShape const &input_shape, - ArrayShape const &output_shape); - -void forward_kernel(ffStream_t stream, - ReducePerDeviceState const &m, +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, float const 
*input_ptr, float *output_ptr); -void backward_kernel(ffStream_t stream, - ReducePerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr); -} // namespace Kernels::Reduce -} // namespace FlexFlow +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce #endif // _FLEXFLOW_OPS_KERNELS_REDUCE_KERNELS_H diff --git a/lib/kernels/include/kernels/reduce_kernels_cpu.h b/lib/kernels/include/kernels/reduce_kernels_cpu.h new file mode 100644 index 0000000000..9b625f9304 --- /dev/null +++ b/lib/kernels/include/kernels/reduce_kernels_cpu.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Reduce { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce + +#endif diff --git a/lib/kernels/include/kernels/reduce_kernels_gpu.h b/lib/kernels/include/kernels/reduce_kernels_gpu.h new file mode 100644 index 0000000000..c0c06fe78b --- /dev/null +++ b/lib/kernels/include/kernels/reduce_kernels_gpu.h @@ -0,0 +1,30 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REDUCE_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/reduce_per_device_state.dtg.h" +#include "op-attrs/operator_type.dtg.h" +#include "op-attrs/tensor_shape.dtg.h" + +namespace FlexFlow::Kernels::Reduce { + +ReducePerDeviceState gpu_init_kernel(PerDeviceFFHandle const &, + OperatorType const &, + size_t const &, + TensorShape const &input_shape, + TensorShape const &output_shape); + +void gpu_forward_kernel(ffStream_t stream, + ReducePerDeviceState const &m, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + ReducePerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr); + +} // namespace FlexFlow::Kernels::Reduce + +#endif diff --git a/lib/kernels/include/kernels/reduce_per_device_state.struct.toml b/lib/kernels/include/kernels/reduce_per_device_state.struct.toml new file mode 100644 index 0000000000..e82099ad25 --- /dev/null +++ b/lib/kernels/include/kernels/reduce_per_device_state.struct.toml @@ -0,0 +1,33 @@ +namespace = "FlexFlow" +name = "ReducePerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", + "op-attrs/operator_type.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "reduceDesc" +type = "ffReduceTensorDescriptor_t" + +[[fields]] +name = "op_type" +type = "::FlexFlow::OperatorType" + +[[fields]] +name = "reduction_size" +type = "size_t" diff --git a/lib/kernels/include/kernels/reduce_tensor_accessor.h b/lib/kernels/include/kernels/reduce_tensor_accessor.h index d803c7ef9b..a06afbf5f6 100644 --- a/lib/kernels/include/kernels/reduce_tensor_accessor.h +++ b/lib/kernels/include/kernels/reduce_tensor_accessor.h @@ -3,10 +3,11 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" -#include "kernels/array_coord.h" #include "kernels/copy_tensor_accessor.h" #include "kernels/datatype_dispatch.h" 
#include "kernels/local_cpu_allocator.h" +#include "op-attrs/tensor_dims_coord.h" +#include "op-attrs/tensor_shape.h" #include "utils/containers/contains.h" #include "utils/containers/foldl1.h" #include "utils/containers/foldr1.h" @@ -32,18 +33,18 @@ struct CPUReduceTensorAccessorInDims { return contains(dims_to_reduce, dim); }; - std::unordered_map> + std::unordered_map> output_coord_from_input_coord = group_by( - get_array_coord_set(input.shape), - [&](ArrayCoord const &input_coord) { - return array_coord_drop_dims(input_coord, should_drop_dim); + get_tensor_dims_coord_set(input.shape.dims), + [&](TensorDimsCoord const &input_coord) { + return tensor_dims_coord_drop_dims(input_coord, should_drop_dim); }); for (auto const &[output_coord, input_coords] : output_coord_from_input_coord) { std::vector input_values = transform( - sorted(input_coords), [&](ArrayCoord const &input_coord) -> T { - return input.at
(input_coord.ff_ordered); + sorted(input_coords), [&](TensorDimsCoord const &input_coord) -> T { + return input.at
(input_coord); }); T result = foldl1(input_values, f); @@ -51,7 +52,7 @@ struct CPUReduceTensorAccessorInDims { return f(elem, accum); })); - output.at
(output_coord.ff_ordered) = result; + output.at
(output_coord) = result; } } }; @@ -71,13 +72,13 @@ GenericTensorAccessorW return contains(dims, dim); }; - ArrayShape reduced_shape = - array_shape_drop_dims(input.shape, should_drop_dim); - GenericTensorAccessorW output_cpu = cpu_allocator.allocate_tensor( - get_tensor_shape(reduced_shape, input.data_type)); + TensorShape reduced_shape = + tensor_shape_drop_dims(input.shape, should_drop_dim); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(reduced_shape); DataTypeDispatch1{}( - input_cpu.data_type, input_cpu, output_cpu, dims, f); + input_cpu.shape.data_type, input_cpu, output_cpu, dims, f); return copy_tensor_accessor_w(output_cpu, output_allocator); } @@ -88,7 +89,7 @@ real_type_t
F &&f) { Allocator cpu_allocator = create_local_cpu_memory_allocator(); - std::unordered_set input_dims = get_ff_dim_t_set(input.shape); + std::unordered_set input_dims = get_ff_dim_t_set(input.shape.dims); GenericTensorAccessorW reduced = reduce_tensor_accessor_in_dims(input, input_dims, cpu_allocator, f); diff --git a/lib/kernels/include/kernels/reduction_kernels.h b/lib/kernels/include/kernels/reduction_kernels.h deleted file mode 100644 index 08f73cd9ab..0000000000 --- a/lib/kernels/include/kernels/reduction_kernels.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Reduction { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input); - -} // namespace FlexFlow::Kernels::Reduction - -#endif // _FLEXFLOW_OPS_KERNELS_REDUCTION_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels.h b/lib/kernels/include/kernels/replicate_kernels.h deleted file mode 100644 index 0b113868ee..0000000000 --- a/lib/kernels/include/kernels/replicate_kernels.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H -#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Replicate { - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas); - -} // namespace FlexFlow::Kernels::Replicate - -#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_H diff --git a/lib/kernels/include/kernels/replicate_kernels_cpu.h b/lib/kernels/include/kernels/replicate_kernels_cpu.h deleted file mode 100644 index 2a2eaa5eb6..0000000000 --- a/lib/kernels/include/kernels/replicate_kernels_cpu.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H -#define _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H - -#include "kernels/accessor.h" -#include "kernels/device.h" - -namespace FlexFlow::Kernels::Replicate { - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output); - -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - size_t num_replicas); - -} // namespace FlexFlow::Kernels::Replicate - -#endif // _FLEXFLOW_OPS_KERNELS_REPLICATE_KERNELS_CPU_H diff --git a/lib/kernels/include/kernels/reshape_kernels.h b/lib/kernels/include/kernels/reshape_kernels.h index 88c11d2fb0..310b349473 100644 --- a/lib/kernels/include/kernels/reshape_kernels.h +++ b/lib/kernels/include/kernels/reshape_kernels.h @@ -2,32 +2,18 @@ #define _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" -#include "utils/required_core.h" +#include "kernels/device_stream_t.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::Reshape { -struct ReshapePerDeviceState { - req data_type; -}; - -FF_VISITABLE_STRUCT(ReshapePerDeviceState, data_type); - -namespace Kernels::Reshape { - -ReshapePerDeviceState init_kernel(DataType data_type); - -void forward_kernel(ffStream_t stream, - ReshapePerDeviceState const 
&per_device_state, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(ffStream_t stream, - ReshapePerDeviceState const &per_device_state, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input); -} // namespace Kernels::Reshape -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Reshape #endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/lib/kernels/include/kernels/reshape_kernels_cpu.h b/lib/kernels/include/kernels/reshape_kernels_cpu.h new file mode 100644 index 0000000000..a81ea639f6 --- /dev/null +++ b/lib/kernels/include/kernels/reshape_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_CPU_H + +#include "kernels/accessor.h" + +namespace FlexFlow::Kernels::Reshape { + +void cpu_forward_kernel(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Reshape + +#endif diff --git a/lib/kernels/include/kernels/reshape_kernels_gpu.h b/lib/kernels/include/kernels/reshape_kernels_gpu.h new file mode 100644 index 0000000000..1454ce56ee --- /dev/null +++ b/lib/kernels/include/kernels/reshape_kernels_gpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_RESHAPE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Reshape { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input); + +} // namespace FlexFlow::Kernels::Reshape + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels.h b/lib/kernels/include/kernels/reverse_kernels.h index 768707175c..bfef26798c 100644 --- a/lib/kernels/include/kernels/reverse_kernels.h +++ b/lib/kernels/include/kernels/reverse_kernels.h @@ -1,17 +1,18 @@ #ifndef _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_H -#include "kernels/device.h" -#include "kernels/reverse_kernels_cpu.h" +#include "kernels/accessor.h" +#include "kernels/device_stream_t.dtg.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" namespace FlexFlow::Kernels::Reverse { -void forward_kernel(ffStream_t stream, +void forward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &input_accessor, GenericTensorAccessorW &output_accessor, ReverseAttrs const &); -void backward_kernel(ffStream_t stream, +void backward_kernel(device_stream_t const &stream, GenericTensorAccessorR const &output_accessor, GenericTensorAccessorW &input_accessor, ReverseAttrs const &); diff --git a/lib/kernels/include/kernels/reverse_kernels_cpu.h b/lib/kernels/include/kernels/reverse_kernels_cpu.h index ec82000f8f..582b167d67 100644 --- a/lib/kernels/include/kernels/reverse_kernels_cpu.h +++ b/lib/kernels/include/kernels/reverse_kernels_cpu.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_OPS_KERNELS_REVERSE_KERNELS_CPU_H #include "kernels/accessor.h" -#include "kernels/device.h" #include "op-attrs/ops/reverse_attrs.dtg.h" namespace FlexFlow::Kernels::Reverse { diff --git 
a/lib/kernels/include/kernels/reverse_kernels_gpu.h b/lib/kernels/include/kernels/reverse_kernels_gpu.h new file mode 100644 index 0000000000..32f256392f --- /dev/null +++ b/lib/kernels/include/kernels/reverse_kernels_gpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/reverse_attrs.dtg.h" + +namespace FlexFlow::Kernels::Reverse { + +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &); + +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_accessor, + GenericTensorAccessorW &input_accessor, + ReverseAttrs const &); + +} // namespace FlexFlow::Kernels::Reverse + +#endif diff --git a/lib/kernels/include/kernels/reverse_kernels_params.h b/lib/kernels/include/kernels/reverse_kernels_params.h index 766d70b915..a2611f5aef 100644 --- a/lib/kernels/include/kernels/reverse_kernels_params.h +++ b/lib/kernels/include/kernels/reverse_kernels_params.h @@ -1,14 +1,14 @@ #ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H #define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_REVERSE_KERNELS_PARAMS_H -#include "kernels/array_shape.h" #include "kernels/reverse_kernels_params.dtg.h" #include "op-attrs/ops/reverse_attrs.dtg.h" +#include "op-attrs/tensor_dims.dtg.h" namespace FlexFlow { ReverseKernelsParams - compute_reverse_kernels_params(ArrayShape const &output_shape, + compute_reverse_kernels_params(TensorDims const &output_dims, ReverseAttrs const &attrs); } // namespace FlexFlow diff --git a/lib/kernels/include/kernels/softmax_kernels.h b/lib/kernels/include/kernels/softmax_kernels.h index 60101578e3..23f0ff879d 100644 --- a/lib/kernels/include/kernels/softmax_kernels.h +++ b/lib/kernels/include/kernels/softmax_kernels.h @@ -1,40 +1,36 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SOFTMAX_KERNELS_H -#include "ff_handle.h" -#include "kernels/device.h" - -namespace FlexFlow { - -// Note(lambda): SoftmaxPerDeviceState may need add more elements -struct SoftmaxPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - req dim; -}; - -FF_VISITABLE_STRUCT(SoftmaxPerDeviceState, handle, inputTensor, dim); - -namespace Kernels::Softmax { - -SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - int dim, - int input_n, - int input_c, - int input_h, - int input_w); - -void forward_kernel(ffStream_t stream, - SoftmaxPerDeviceState const &m, - float const *input_ptr, - float *output_ptr); - -void backward_kernel(ffStream_t stream, +#include "kernels/device_handle_t.dtg.h" +#include "kernels/device_stream_t.dtg.h" +#include "kernels/ff_handle.h" +#include "kernels/softmax_per_device_state.dtg.h" +#include "pcg/device_type.dtg.h" + +namespace FlexFlow::Kernels::Softmax { + +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w); + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void backward_kernel(device_stream_t const &stream, float const *output_grad_ptr, float *input_grad_ptr, size_t num_elements); -} // namespace Kernels::Softmax -} // namespace FlexFlow +void cleanup_kernel(DeviceType device_type, + 
std::optional &per_device_state); + +} // namespace FlexFlow::Kernels::Softmax #endif diff --git a/lib/kernels/include/kernels/softmax_kernels_cpu.h b/lib/kernels/include/kernels/softmax_kernels_cpu.h new file mode 100644 index 0000000000..536a28e62c --- /dev/null +++ b/lib/kernels/include/kernels/softmax_kernels_cpu.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_CPU_H + +#include + +namespace FlexFlow::Kernels::Softmax { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr); + +void cpu_backward_kernel(float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements); + +} // namespace FlexFlow::Kernels::Softmax + +#endif diff --git a/lib/kernels/include/kernels/softmax_kernels_gpu.h b/lib/kernels/include/kernels/softmax_kernels_gpu.h new file mode 100644 index 0000000000..16e98857f4 --- /dev/null +++ b/lib/kernels/include/kernels/softmax_kernels_gpu.h @@ -0,0 +1,32 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SOFTMAX_KERNELS_GPU_H + +#include "kernels/device.h" +#include "kernels/ff_handle.h" +#include "kernels/softmax_per_device_state.dtg.h" +#include "op-attrs/ff_dim_t.dtg.h" + +namespace FlexFlow::Kernels::Softmax { + +SoftmaxPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w); + +void gpu_forward_kernel(ffStream_t stream, + SoftmaxPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr); + +void gpu_backward_kernel(ffStream_t stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements); + +void gpu_cleanup_kernel(SoftmaxPerDeviceState &per_device_state); + +} // namespace FlexFlow::Kernels::Softmax + +#endif diff --git a/lib/kernels/include/kernels/softmax_per_device_state.struct.toml b/lib/kernels/include/kernels/softmax_per_device_state.struct.toml new file mode 100644 index 0000000000..374dd28c63 --- /dev/null +++ b/lib/kernels/include/kernels/softmax_per_device_state.struct.toml @@ -0,0 +1,21 @@ +namespace = "FlexFlow" +name = "SoftmaxPerDeviceState" +features = [] + +includes = [ + "kernels/ff_handle.h", + "kernels/device.h", + "op-attrs/ff_dim_t.dtg.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "dim" +type = "::FlexFlow::ff_dim_t" diff --git a/lib/kernels/include/kernels/split_kernels.h b/lib/kernels/include/kernels/split_kernels.h index 3b580f94be..6c3d576f29 100644 --- a/lib/kernels/include/kernels/split_kernels.h +++ b/lib/kernels/include/kernels/split_kernels.h @@ -1,22 +1,24 @@ #ifndef _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_SPLIT_KERNELS_H -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" namespace FlexFlow::Kernels::Split { -void forward_kernel(ffStream_t stream, + +void forward_kernel(device_stream_t const &stream, float **out_ptrs, float const *in_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, int numOutputs); -void backward_kernel(ffStream_t stream, + +void backward_kernel(device_stream_t const &stream, float *in_grad_ptr, float const **out_grad_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, + int const *out_blk_sizes, + int 
in_blk_size, + int num_blks, int numOutputs); } // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/include/kernels/split_kernels_cpu.h b/lib/kernels/include/kernels/split_kernels_cpu.h new file mode 100644 index 0000000000..7f50804dff --- /dev/null +++ b/lib/kernels/include/kernels/split_kernels_cpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_CPU_H + +namespace FlexFlow::Kernels::Split { + +void cpu_forward_kernel(float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +void cpu_backward_kernel(float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +} // namespace FlexFlow::Kernels::Split + +#endif diff --git a/lib/kernels/include/kernels/split_kernels_gpu.h b/lib/kernels/include/kernels/split_kernels_gpu.h new file mode 100644 index 0000000000..e6bfc5454c --- /dev/null +++ b/lib/kernels/include/kernels/split_kernels_gpu.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_SPLIT_KERNELS_GPU_H + +#include "kernels/device.h" + +namespace FlexFlow::Kernels::Split { + +void gpu_forward_kernel(ffStream_t stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +void gpu_backward_kernel(ffStream_t stream, + float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs); + +} // namespace FlexFlow::Kernels::Split + +#endif diff --git a/lib/kernels/include/kernels/tensor_accessor_binary_ops.h b/lib/kernels/include/kernels/tensor_accessor_binary_ops.h new file mode 100644 index 0000000000..dde51b3266 --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_binary_ops.h @@ -0,0 +1,48 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_BINARY_OPS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_BINARY_OPS_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_elementwise_add(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_add_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_elementwise_subtract(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_subtract_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_elementwise_multiply(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_elementwise_multiply_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW tensor_accessor_matmul(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator); + +void tensor_accessor_matmul_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output); + +} // namespace FlexFlow + 
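+// Usage sketch: each binary op above comes in a value-returning form that
+// allocates its result through the given Allocator, and a *_to form that
+// writes into a caller-provided accessor. Assuming two read accessors
+// lhs/rhs and the create_local_cpu_memory_allocator() factory used
+// elsewhere in this series:
+//
+//   Allocator alloc = create_local_cpu_memory_allocator();
+//   GenericTensorAccessorW sum =
+//       tensor_accessor_elementwise_add(lhs, rhs, alloc);  // allocates
+//   tensor_accessor_elementwise_add_to(lhs, rhs, sum);     // writes in place
+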
+#endif diff --git a/lib/kernels/include/kernels/tensor_accessor_unary_ops.h b/lib/kernels/include/kernels/tensor_accessor_unary_ops.h new file mode 100644 index 0000000000..b7bb561e4a --- /dev/null +++ b/lib/kernels/include/kernels/tensor_accessor_unary_ops.h @@ -0,0 +1,50 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_UNARY_OPS_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TENSOR_ACCESSOR_UNARY_OPS_H + +#include "kernels/accessor.h" +#include "kernels/allocation.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_scale_by_constant(GenericTensorAccessorR const &input, + float constant, + Allocator &output_allocator); + +void tensor_accessor_scale_by_constant_inplace( + GenericTensorAccessorW const &input, float constant); + +GenericTensorAccessorW tensor_accessor_relu(GenericTensorAccessorR const &input, + Allocator &output_allocator); + +void tensor_accessor_relu_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_broadcast(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + Allocator &output_allocator); + +void tensor_accessor_broadcast_to(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_transpose(GenericTensorAccessorR const &input, + Allocator &output_allocator); + +void tensor_accessor_transpose_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +GenericTensorAccessorW + tensor_accessor_reduce(GenericTensorAccessorR const &input, + ff_dim_t dim, + Allocator &output_allocator); + +void tensor_accessor_reduce_to(GenericTensorAccessorR const &input, + ff_dim_t dim, + GenericTensorAccessorW const &output); + +} // namespace FlexFlow + +#endif diff --git a/lib/kernels/include/kernels/topk_kernels.h b/lib/kernels/include/kernels/topk_kernels.h index 085594d57f..af9de11736 100644 --- a/lib/kernels/include/kernels/topk_kernels.h +++ b/lib/kernels/include/kernels/topk_kernels.h @@ -1,23 +1,11 @@ #ifndef _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H #define _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H -#include "kernels/allocation.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" -namespace FlexFlow { +namespace FlexFlow::Kernels::TopK { -struct TopKPerDeviceState { - req sorted; // Note: Does TopK needs a PerDeviceFFHandle handle? 
-}; - -FF_VISITABLE_STRUCT(TopKPerDeviceState, sorted); - -namespace Kernels::TopK { - -TopKPerDeviceState init_kernel(bool sorted); - -void forward_kernel(ffStream_t stream, - TopKPerDeviceState const &m, +void forward_kernel(device_stream_t const &stream, float const *input_ptr, float *output_ptr, int *indices_ptr, @@ -25,8 +13,8 @@ void forward_kernel(ffStream_t stream, int length, int k, bool sorted); -void backward_kernel(ffStream_t stream, - TopKPerDeviceState const &m, + +void backward_kernel(device_stream_t const &stream, float const *out_grad_ptr, int const *indices_ptr, float *in_grad_ptr, @@ -34,7 +22,6 @@ void backward_kernel(ffStream_t stream, int length, int k); -} // namespace Kernels::TopK -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::TopK #endif // _FLEXFLOW_OPS_KERNELS_TOPK_KERNELS_H diff --git a/lib/kernels/include/kernels/topk_kernels_cpu.h b/lib/kernels/include/kernels/topk_kernels_cpu.h new file mode 100644 index 0000000000..a3764c40dd --- /dev/null +++ b/lib/kernels/include/kernels/topk_kernels_cpu.h @@ -0,0 +1,25 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_CPU_H + +#include + +namespace FlexFlow::Kernels::TopK { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted); + +void cpu_backward_kernel(float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k); + +} // namespace FlexFlow::Kernels::TopK + +#endif diff --git a/lib/kernels/include/kernels/topk_kernels_gpu.h b/lib/kernels/include/kernels/topk_kernels_gpu.h new file mode 100644 index 0000000000..e669e79048 --- /dev/null +++ b/lib/kernels/include/kernels/topk_kernels_gpu.h @@ -0,0 +1,27 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TOPK_KERNELS_GPU_H + +#include "kernels/device.h" + +namespace FlexFlow::Kernels::TopK { + +void gpu_forward_kernel(ffStream_t stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted); + +void gpu_backward_kernel(ffStream_t stream, + float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k); + +} // namespace FlexFlow::Kernels::TopK + +#endif diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 776370dcbd..96b0a9c4aa 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -2,25 +2,21 @@ #define _FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H #include "kernels/accessor.h" -#include "kernels/device.h" +#include "kernels/device_stream_t.dtg.h" #include "op-attrs/ops/transpose_attrs.dtg.h" -#include -namespace FlexFlow { +namespace FlexFlow::Kernels::Transpose { -namespace Kernels::Transpose { - -void forward_kernel(cudaStream_t stream, +void forward_kernel(device_stream_t const &stream, TransposeAttrs const &attrs, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); -void backward_kernel(cudaStream_t stream, +void backward_kernel(device_stream_t const &stream, TransposeAttrs const &attrs, GenericTensorAccessorR const &out_grad, GenericTensorAccessorW const &in_grad); -} // namespace Kernels::Transpose -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::Transpose #endif // 
_FLEXFLOW_OPS_KERNELS_TRANSPOSE_KERNELS_H diff --git a/lib/kernels/include/kernels/transpose_kernels_cpu.h b/lib/kernels/include/kernels/transpose_kernels_cpu.h new file mode 100644 index 0000000000..dd8963d5e4 --- /dev/null +++ b/lib/kernels/include/kernels/transpose_kernels_cpu.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_CPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_CPU_H + +#include "kernels/accessor.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" + +namespace FlexFlow::Kernels::Transpose { + +void cpu_forward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void cpu_backward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); + +} // namespace FlexFlow::Kernels::Transpose + +#endif diff --git a/lib/kernels/include/kernels/transpose_kernels_gpu.h b/lib/kernels/include/kernels/transpose_kernels_gpu.h new file mode 100644 index 0000000000..67f6e48665 --- /dev/null +++ b/lib/kernels/include/kernels/transpose_kernels_gpu.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_GPU_H +#define _FLEXFLOW_LIB_KERNELS_INCLUDE_KERNELS_TRANSPOSE_KERNELS_GPU_H + +#include "kernels/accessor.h" +#include "kernels/device.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" + +namespace FlexFlow::Kernels::Transpose { + +void gpu_forward_kernel(ffStream_t stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void gpu_backward_kernel(ffStream_t stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad); + +} // namespace FlexFlow::Kernels::Transpose + +#endif diff --git a/lib/kernels/src/cpu/ops/combine_kernels.cc b/lib/kernels/src/cpu/ops/combine_kernels.cc deleted file mode 100644 index c0c856ae5b..0000000000 --- a/lib/kernels/src/cpu/ops/combine_kernels.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "kernels/combine_kernels_cpu.h" -#include "kernels/datatype_dispatch.h" - -namespace FlexFlow::Kernels::Combine { - -template -struct CPUForwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - memcpy(output.get
<DT>(), - input.get<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int()); - } -}; - -template <DataType DT> -struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - size_t num_elements = - output_grad.shape.num_elements().int_from_positive_int(); - for (int i = 0; i < num_elements; ++i) { - input_grad.get<DT>()[i] += output_grad.get<DT>()[i]; - } - } -}; - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); -} - -void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1<CPUBackwardKernel>{}( - input_grad.data_type, output_grad, input_grad); -} - -} // namespace FlexFlow::Kernels::Combine diff --git a/lib/kernels/src/cpu/ops/initializer_kernels.cc b/lib/kernels/src/cpu/ops/initializer_kernels.cc index c7f43b5762..94eac25fdf 100644 --- a/lib/kernels/src/cpu/ops/initializer_kernels.cc +++ b/lib/kernels/src/cpu/ops/initializer_kernels.cc @@ -9,14 +9,14 @@ template <DataType DT> struct ZeroInitKernel { void operator()(GenericTensorAccessorW const &tensor) const { auto arr = get<DT>(tensor);
- for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape.dims); i++) { arr[i] = 0.0f; } } }; void zero_init_kernel_cpu(GenericTensorAccessorW const &tensor) { - DataTypeDispatch1<ZeroInitKernel>{}(tensor.data_type, tensor); + DataTypeDispatch1<ZeroInitKernel>{}(tensor.shape.data_type, tensor); } template <DataType DT> @@ -25,7 +25,7 @@ struct ConstantInitKernel { DataTypeValue value) const { auto arr = get<DT>(tensor);
auto unwrapped_value = value.get<real_type_t<DT>>(); - for (size_t i = 0; i < get_num_elements(tensor.shape); i++) { + for (size_t i = 0; i < get_num_elements(tensor.shape.dims); i++) { arr[i] = unwrapped_value; } } @@ -33,7 +33,8 @@ struct ConstantInitKernel { void constant_init_kernel_cpu(GenericTensorAccessorW const &tensor, DataTypeValue value) { - DataTypeDispatch1<ConstantInitKernel>{}(tensor.data_type, tensor, value); + DataTypeDispatch1<ConstantInitKernel>{}( + tensor.shape.data_type, tensor, value); } void zero_init_kernel(TaskLocation const &loc, diff --git a/lib/kernels/src/cpu/ops/replicate_kernels.cc b/lib/kernels/src/cpu/ops/replicate_kernels.cc deleted file mode 100644 index bc9c4eab0d..0000000000 --- a/lib/kernels/src/cpu/ops/replicate_kernels.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "kernels/datatype_dispatch.h" -#include "kernels/replicate_kernels_cpu.h" -#include "utils/nonnegative_int/nonnegative_range.h" - -namespace FlexFlow::Kernels::Replicate { - -template <DataType DT> -struct CPUForwardKernel { - void operator()(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output) { - memcpy(output.get<DT>
<DT>(), - input.get<DT>(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int()); - } -}; - -template <DataType DT> -struct CPUBackwardKernel { - void operator()(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - positive_int num_elements, - nonnegative_int num_replicas) { - using T = real_type_t<DT>;
- - for (nonnegative_int i : - nonnegative_range(num_elements.nonnegative_int_from_positive_int())) { - T cur_sum = 0; - for (nonnegative_int replica_idx : nonnegative_range(num_replicas)) { - cur_sum += output.at<DT>(LegionOrdered{replica_idx, i}); - } - input.at<DT>
(LegionOrdered{i}) = cur_sum; - } - } -}; - -void cpu_forward_kernel(GenericTensorAccessorR const &input, - GenericTensorAccessorW &output) { - DataTypeDispatch1<CPUForwardKernel>{}(input.data_type, input, output); -} - -void cpu_backward_kernel(GenericTensorAccessorR const &output, - GenericTensorAccessorW &input, - size_t num_replicas) { - positive_int num_elements = input.shape.num_elements(); - DataTypeDispatch1<CPUBackwardKernel>{}(input.data_type, - output, - input, - num_elements, - nonnegative_int{num_replicas}); -} - -} // namespace FlexFlow::Kernels::Replicate diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 98faadf5ac..cd89945579 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -1,5 +1,6 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" +#include "op-attrs/tensor_dims.h" #include "utils/containers/reversed.h" namespace FlexFlow { @@ -217,31 +218,44 @@ __host__ void checkCUDA(cudaFreeHost(host_ptr)); } -ffStatus_t - cudnnSetTensorDescriptorFromArrayShape(cudnnTensorDescriptor_t tensor, - ArrayShape const &shape) { - return cudnnSetTensor4dDescriptor( - tensor, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - shape.at_maybe(legion_dim_t{0_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{1_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{2_n}).value_or(1_p).int_from_positive_int(), - shape.at_maybe(legion_dim_t{3_n}).value_or(1_p).int_from_positive_int()); -} - -cudnnDataType_t ff_to_cudnn_datatype(DataType type) { - switch (type) { +ffCudnnDataType_t ff_to_cudnn_datatype(DataType flexflow_data_type) { + switch (flexflow_data_type) { + case DataType::BOOL: + return CUDNN_DATA_BOOLEAN; + case DataType::INT32: + return CUDNN_DATA_INT32; + case DataType::INT64: + return CUDNN_DATA_INT64; + case DataType::HALF: + return CUDNN_DATA_HALF; case DataType::FLOAT: return CUDNN_DATA_FLOAT; case DataType::DOUBLE: return CUDNN_DATA_DOUBLE; - case DataType::INT32: - return CUDNN_DATA_INT32; default: - assert(false && "Unsupported cudnn data type"); + PANIC("Unhandled DataType value", flexflow_data_type); } - return CUDNN_DATA_FLOAT; +} + +ffStatus_t + cudnnSetTensorDescriptorFromTensorShape(cudnnTensorDescriptor_t tensor, + TensorShape const &shape) { + return cudnnSetTensor4dDescriptor( + tensor, + CUDNN_TENSOR_NCHW, + ff_to_cudnn_datatype(shape.data_type), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-1}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-2}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-3}) + .value_or(1_p) + .int_from_positive_int(), + try_dim_at_idx(shape.dims, relative_ff_dim_t{-4}) + .value_or(1_p) + .int_from_positive_int()); } cudaDataType_t ff_to_cuda_datatype(DataType type) { diff --git a/lib/kernels/src/cuda/embedding_kernels.cu b/lib/kernels/src/cuda/embedding_kernels.cu index a7e28c6297..be6bcb4ffc 100644 --- a/lib/kernels/src/cuda/embedding_kernels.cu +++ b/lib/kernels/src/cuda/embedding_kernels.cu @@ -15,10 +15,17 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/embedding_kernels.h" +#include "kernels/embedding_kernels_gpu.h" namespace FlexFlow::Kernels::Embedding { +template <typename TD> +__global__ void rand_generate_int(TD *ptr, size_t size, TD p) { + CUDA_KERNEL_LOOP(i, size) { + ptr[i] = i % p; + } +} + void rand_generate_int64_wrapper(int64_t *ptr, size_t size, int64_t p) { cudaStream_t stream;
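// Note: this legacy wrapper still resolves the raw CUDA stream through
// get_legion_stream(), while the refactored entry points in this series take
// a device_stream_t and unwrap it on the GPU path. A sketch of that newer
// pattern, reusing the helpers introduced in profiling.h above:
//   device_stream_t s = get_gpu_device_stream();
//   checkCUDA(ffEventRecord(t_start, s.require_gpu()));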
checkCUDA(get_legion_stream(&stream)); @@ -309,13 +316,6 @@ __global__ void embed_backward_with_aggr(int64_t const *input, } } -template -__global__ void rand_generate_int(TD *ptr, size_t size, TD p) { - CUDA_KERNEL_LOOP(i, size) { - ptr[i] = i % p; - } -} - template struct ForwardKernel { void operator()(cudaStream_t stream, @@ -343,7 +343,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -354,7 +355,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -380,7 +382,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -391,7 +394,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -417,7 +421,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -428,7 +433,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -454,7 +460,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -465,7 +472,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -491,7 +499,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -502,7 +511,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -528,7 +538,8 @@ struct ForwardKernel { int batch_size) { if (!aggr.has_value()) { embed_forward_no_aggr - <<>>(input.get(), @@ -539,7 +550,8 @@ struct ForwardKernel { } else { assert(aggr == AggregateOp::AVG || aggr == AggregateOp::SUM); embed_forward_with_aggr - <<>>(input.get(), @@ -580,7 +592,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -590,7 +603,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -616,7 +630,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -626,7 +641,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -652,7 +668,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -662,7 +679,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -688,7 +706,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -698,7 +717,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -724,7 +744,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -734,7 +755,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -760,7 +782,8 @@ struct BackwardKernel { int batch_size) { if (!aggr.has_value()) { embed_backward_no_aggr - <<>>(input.get(), @@ -770,7 +793,8 @@ struct BackwardKernel { batch_size); } else { embed_backward_with_aggr - <<>>(input.get(), @@ -784,16 +808,16 @@ struct BackwardKernel { } }; -void 
forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorR const &weight, - DataType input_data_type, - DataType output_data_type, - std::optional aggr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorR const &weight, + DataType input_data_type, + DataType output_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size) { DataTypeDispatch2{}(input_data_type, output_data_type, stream, @@ -806,16 +830,16 @@ void forward_kernel(ffStream_t stream, batch_size); } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &weight_grad, - DataType output_data_type, - DataType input_data_type, - std::optional aggr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &weight_grad, + DataType output_data_type, + DataType input_data_type, + std::optional aggr, + int in_dim, + int out_dim, + int batch_size) { DataTypeDispatch2{}(output_data_type, input_data_type, stream, diff --git a/lib/kernels/src/cuda/loss_function_kernels.cu b/lib/kernels/src/cuda/loss_function_kernels.cu index 2fccf4b48f..a98aa0ccda 100644 --- a/lib/kernels/src/cuda/loss_function_kernels.cu +++ b/lib/kernels/src/cuda/loss_function_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/loss_function_kernels.h" +#include "kernels/loss_function_kernels_gpu.h" namespace FlexFlow { @@ -56,7 +56,7 @@ __global__ void identity_loss_backward(float *loss_grad, } } -void sparse_categorical_crossentropy_loss_backward_kernel( +void sparse_categorical_crossentropy_loss_backward_gpu_kernel( cudaStream_t stream, float *logit_grad_ptr, float const *logit_ptr, @@ -83,13 +83,13 @@ void sparse_categorical_crossentropy_loss_backward_kernel( logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); } -void categorical_crossentropy_loss_backward_kernel(cudaStream_t stream, - float *logit_grad_ptr, - float const *logit_ptr, - float const *label_ptr, - size_t logit_volume, - size_t logit_grad_volume, - float scale_factor) { +void categorical_crossentropy_loss_backward_gpu_kernel(cudaStream_t stream, + float *logit_grad_ptr, + float const *logit_ptr, + float const *label_ptr, + size_t logit_volume, + size_t logit_grad_volume, + float scale_factor) { // cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); categorical_crossentropy_loss_backward<<tie() == other.tie(); -} - -bool MHAPerDeviceState::operator!=(MHAPerDeviceState const &other) const { - return this->tie() != other.tie(); -} - -std:: - tuple - MHAPerDeviceState::tie() const { - return std::tie(this->handle, - this->weightSize, - this->reserveSpaceSize, - this->attnDesc, - this->qDesc, - this->kDesc, - this->vDesc, - this->oDesc, - this->devQoSeqArray, - this->devKvSeqArray, - this->loWinIdx, - this->hiWinIdx, - this->reserveSpace); -} - -std::string format_as(MHAPerDeviceState const &x) { - return fmt::format("MHAPerDeviceState"); -} - -std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x) { - return (s << fmt::to_string(x)); -} - -namespace Kernels { -namespace MultiHeadAttention { - -MHAPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator 
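The renamed loss kernels implement the standard cross-entropy gradient: for softmax probabilities p and an integer label y, d(loss)/d(logit_j) = p_j - [j == y], scaled by a factor such as 1/batch_size. A self-contained sketch of the per-sample update, assuming logit_grad already holds the softmax probabilities when the kernel runs (which appears to match how the in-tree kernel is fed before the trailing scale pass):

#include <cuda_runtime.h>

// Launch with at least n threads; sample i owns the k-wide row i of logit_grad.
__global__ void sparse_ce_backward_sketch(float *logit_grad, // [n, k], holds softmax(z)
                                          int const *labels, // [n], values in [0, k)
                                          int n, int k, float scale) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    logit_grad[i * k + labels[i]] -= 1.0f; // subtract the one-hot target
    for (int j = 0; j < k; j++) {
      logit_grad[i * k + j] *= scale;      // e.g. 1 / batch_size
    }
  }
}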
&allocator, - int num_samples, - int num_heads, - int qSize, - int kSize, - int vSize, - int qProjSize, - int kProjSize, - int vProjSize, - int oProjSize, - int qoSeqLength, - int kvSeqLength, - bool add_bias_kv) { +MHAPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + int num_samples, + int num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv) { cudaStream_t stream; ffAttnDescriptor_t attnDesc; ffSeqDataDescriptor_t qDesc; @@ -225,31 +188,33 @@ MHAPerDeviceState init_kernel(PerDeviceFFHandle const &handle, hiWinIdx[i] = kvSeqLength; } - MHAPerDeviceState per_device_state = {handle, - weightSize, - reserveSpaceSize, - attnDesc, - qDesc, - kDesc, - vDesc, - oDesc, - devQoSeqArray, - devKvSeqArray, - loWinIdx, - hiWinIdx, - reserveSpace, - allocator}; + MHAPerDeviceState per_device_state = MHAPerDeviceState{ + /*handle=*/handle, + /*weightSize=*/weightSize, + /*reserveSpaceSize=*/reserveSpaceSize, + /*attnDesc=*/attnDesc, + /*qDesc=*/qDesc, + /*kDesc=*/kDesc, + /*vDesc=*/vDesc, + /*oDesc=*/oDesc, + /*devQoSeqArray=*/devQoSeqArray, + /*devKvSeqArray=*/devKvSeqArray, + /*loWinIdx=*/loWinIdx, + /*hiWinIdx=*/hiWinIdx, + /*reserveSpace=*/reserveSpace, + /*allocator=*/allocator, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - MHAPerDeviceState const &device_state, - float const *query_ptr, - float const *key_ptr, - float const *value_ptr, - float const *weight_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(device_state.handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnForward(device_state.handle.dnn, @@ -276,17 +241,17 @@ void forward_kernel(cudaStream_t stream, device_state.reserveSpace)); } -void backward_kernel(cudaStream_t stream, - MHAPerDeviceState const &device_state, - float const *query_ptr, - float *query_grad_ptr, - float const *key_ptr, - float *key_grad_ptr, - float const *value_ptr, - float *value_grad_ptr, - float const *weight_ptr, - float *weight_grad_ptr, - float const *output_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + MHAPerDeviceState const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { checkCUDNN(cudnnSetStream(device_state.handle.dnn, stream)); checkCUDNN(cudnnMultiHeadAttnBackwardData(device_state.handle.dnn, @@ -333,8 +298,8 @@ void backward_kernel(cudaStream_t stream, device_state.reserveSpace)); } -void cleanup_kernel(Allocator &allocator, - MHAPerDeviceState const &device_state) { +void gpu_cleanup_kernel(Allocator &allocator, + MHAPerDeviceState const &device_state) { free(device_state.loWinIdx); free(device_state.hiWinIdx); checkCUDNN(cudnnDestroyAttnDescriptor(device_state.attnDesc)); @@ -344,6 +309,4 @@ void cleanup_kernel(Allocator &allocator, checkCUDNN(cudnnDestroySeqDataDescriptor(device_state.oDesc)); } -} // namespace MultiHeadAttention -} // namespace Kernels -} // namespace FlexFlow +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu 
b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu index 348eed9f0c..39f5beea21 100644 --- a/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_matmul_kernels.cu @@ -14,24 +14,24 @@ */ #include "internal/device.h" -#include "kernels/batch_matmul_kernels.h" +#include "kernels/batch_matmul_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace BatchMatmul { -void forward_kernel(cudaStream_t stream, - PerDeviceFFHandle const &handle, - float *output_ptr, - float const *a_input_ptr, - float const *b_input_ptr, - int m, - int n, - int k, - int batch, - int a_seq_length_dim, - int b_seq_length_dim, - int seq_length) { +void gpu_forward_kernel(cudaStream_t stream, + PerDeviceFFHandle const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int a_seq_length_dim, + int b_seq_length_dim, + int seq_length) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); int lda = k; @@ -83,18 +83,18 @@ void forward_kernel(cudaStream_t stream, batch)); } -void backward_kernel(cudaStream_t stream, - PerDeviceFFHandle const &handle, - float const *o_ptr, - float const *o_grad_ptr, - float const *a_ptr, - float *a_grad_ptr, - float const *b_ptr, - float *b_grad_ptr, - int m, - int n, - int k, - int batch) { +void gpu_backward_kernel(cudaStream_t stream, + PerDeviceFFHandle const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index ceb3a1b3d9..26234fd6e0 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -23,12 +23,12 @@ namespace FlexFlow { namespace Kernels { namespace BatchNorm { -void forward_kernel(cudaStream_t stream, - BatchNormPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *scale_ptr, - float const *bias_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + BatchNormPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -51,16 +51,16 @@ void forward_kernel(cudaStream_t stream, m.saveVar)); } -void backward_kernel(cudaStream_t stream, - BatchNormPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *scale_ptr, - float *scale_grad_ptr, - float *bias_grad_ptr, - size_t numElements) { +void gpu_backward_kernel(cudaStream_t stream, + BatchNormPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -89,14 +89,14 @@ void backward_kernel(cudaStream_t stream, m.saveVar)); } -BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, - Allocator allocator, - float *runningMean, - int output_n, - int output_c, - int output_h, - int output_w, - bool relu) { +BatchNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const 
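The batch-matmul forward call above reduces to a single strided-batched GEMM. A minimal sketch against the raw cuBLAS API; the in-tree version adds seq-length handling and goes through cublasGemmEx, so treat this as an illustration of the shape and stride bookkeeping rather than the exact call:

#include <cublas_v2.h>

// C[i] = A[i]^T * B[i] for each batch entry, matching the lda = k convention
// in the hunk above. A is k x m per batch (column-major), B is k x n per
// batch, C is m x n per batch.
void batch_matmul_sketch(cublasHandle_t blas, cudaStream_t stream,
                         float const *a, float const *b, float *c,
                         int m, int n, int k, int batch) {
  cublasSetStream(blas, stream);
  float alpha = 1.0f, beta = 0.0f;
  cublasSgemmStridedBatched(blas, CUBLAS_OP_T, CUBLAS_OP_N,
                            m, n, k,
                            &alpha,
                            a, k, (long long)m * k,   // stride to next A
                            b, k, (long long)n * k,   // stride to next B
                            &beta,
                            c, m, (long long)m * n,   // stride to next C
                            batch);
}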
&handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffTensorDescriptor_t biasTensor; @@ -167,19 +167,14 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, return per_device_state; } -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t biasTensor, - ffTensorDescriptor_t outputTensor, - ffActivationDescriptor_t actiDesc, - bool relu, - float *runningMean) { - allocator.deallocate(runningMean); - checkCUDNN(cudnnDestroyTensorDescriptor(inputTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(biasTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(outputTensor)); - if (relu) { - checkCUDNN(cudnnDestroyActivationDescriptor(actiDesc)); +void gpu_cleanup_kernel(Allocator &allocator, + BatchNormPerDeviceState &per_device_state) { + allocator.deallocate(per_device_state.runningMean); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.inputTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.biasTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.outputTensor)); + if (per_device_state.relu) { + checkCUDNN(cudnnDestroyActivationDescriptor(per_device_state.actiDesc)); } } diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index 3de6de9d5e..7e38c7af40 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_gpu.h" #include "kernels/datatype_dispatch.h" namespace FlexFlow { @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(input.shape.dims).int_from_positive_int(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,24 +52,24 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(output.shape.dims).int_from_positive_int(); cast_backward<<>>( output.get(), input.get(), volume, cast_to(1.0f)); } }; -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { DataTypeDispatch2{}( - input.data_type, output.data_type, stream, input, output); + input.shape.data_type, output.shape.data_type, stream, input, output); } -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { DataTypeDispatch2{}( - output.data_type, input.data_type, stream, output, input); + output.shape.data_type, input.shape.data_type, stream, output, input); } } // namespace Cast diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu deleted file mode 100644 index f091a69b71..0000000000 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ /dev/null @@ -1,68 +0,0 @@ -/* 
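cast_kernels.cu now reads both element types off the tensor shapes and double-dispatches on them via DataTypeDispatch2; the kernel underneath is just an element-wise static_cast. A trimmed sketch of that kernel, with the dispatcher assumed to map the two runtime DataType values onto the IDT/ODT template parameters:

#include <cuda_runtime.h>
#include <cstddef>

// One thread per element; each output element is the converted input element.
template <typename IDT, typename ODT>
__global__ void cast_forward_sketch(IDT const *in, ODT *out, size_t volume) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < volume) {
    out[i] = static_cast<ODT>(in[i]);
  }
}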
Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "internal/device.h" -#include "kernels/accessor.h" -#include "kernels/combine_kernels.h" -#include "kernels/datatype_dispatch.h" - -namespace FlexFlow { -namespace Kernels { -namespace Combine { - -template -struct ForwardKernel { - void operator()(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get
(), - input.get
(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(DT).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - size_t num_elements = - output_grad.shape.num_elements().int_from_positive_int(); - add_kernel> - <<>>( - input_grad.get
(), output_grad.get
(), num_elements); - } -}; - -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); -} - -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( - input_grad.data_type, stream, output_grad, input_grad); -} - -} // namespace Combine -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index e7f88bc258..667a7c0b74 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -14,32 +14,26 @@ */ #include "internal/device.h" -#include "kernels/concat_kernels.h" +#include "kernels/concat_kernels_gpu.h" #include namespace FlexFlow::Kernels::Concat { -void calc_blk_size(size_t &num_blocks, - size_t &blk_size, - ArrayShape const &shape, - ff_dim_t axis) { - legion_dim_t legion_axis = legion_dim_from_ff_dim(axis, shape.num_dims()); - assert(legion_axis.value < shape.num_dims()); - if (legion_axis.value == 0_n) { - legion_axis.value = 1_n; - } - blk_size = shape.sub_shape(legion_dim_t{0_n}, legion_axis) - .num_elements() +static void calc_blk_size(size_t &num_blocks, + size_t &blk_size, + TensorShape const &shape, + ff_dim_t axis) { + blk_size = get_num_elements(slice_tensor_dims(shape.dims, axis, std::nullopt)) .int_from_positive_int(); - num_blocks = shape.sub_shape(legion_axis, std::nullopt) - .num_elements() - .int_from_positive_int(); + num_blocks = + get_num_elements(slice_tensor_dims(shape.dims, ff_dim_t{0_n}, axis)) + .int_from_positive_int(); } -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorW const &output, - std::vector const &inputs, - ff_dim_t axis) { +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { assert(inputs.size() <= MAX_NUM_INPUTS); size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output.shape, axis); @@ -68,10 +62,10 @@ void forward_kernel(cudaStream_t stream, } } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output_grad, - std::vector const &input_grads, - ff_dim_t axis) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { assert(input_grads.size() <= MAX_NUM_INPUTS); size_t num_blocks = 1, output_blk_size = 1; calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis); diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index 6e446008ed..92046b30ae 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -113,19 +113,20 @@ cudnnConvolutionBwdFilterAlgo_t selectConvolutionBackwardFilterAlgorithm( return perfResults[0].algo; } -Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int kernel_h, - int kernel_w, - int groups, - int pad_h, - int pad_w, - int stride_h, - int stride_w, - GenericTensorAccessorW const &input, - GenericTensorAccessorW const &output, - float const *filter_ptr, - float *filter_grad_ptr) { +Conv2DPerDeviceState + gpu_init_kernel(PerDeviceFFHandle const &handle, + std::optional const &activation, + int kernel_h, + int kernel_w, + int groups, + int pad_h, + int pad_w, + int stride_h, + int 
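The new calc_blk_size deserves a worked example: with dims listed outermost-first, everything from the concat axis inward forms one contiguous block and everything before the axis counts the blocks. A standalone sketch of the same arithmetic, assuming plain row-major dims:

#include <cstddef>
#include <vector>

// For dims = {2, 3, 4} and axis = 1: blk_size = 3 * 4 = 12, num_blocks = 2,
// so each input is copied as 2 blocks of 12 contiguous elements.
void calc_blk_size_sketch(size_t &num_blocks, size_t &blk_size,
                          std::vector<size_t> const &dims, size_t axis) {
  num_blocks = 1;
  blk_size = 1;
  for (size_t d = 0; d < dims.size(); d++) {
    (d < axis ? num_blocks : blk_size) *= dims[d];
  }
}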
stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t biasTensor; @@ -137,15 +138,23 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape.at(legion_dim_t(0_n)).int_from_positive_int(); - int input_h = input.shape.at(legion_dim_t(1_n)).int_from_positive_int(); - int input_c = input.shape.at(legion_dim_t(2_n)).int_from_positive_int(); - int input_n = input.shape.at(legion_dim_t(3_n)).int_from_positive_int(); + int input_w = + dim_at_idx(input.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); + int input_h = + dim_at_idx(input.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); + int input_c = + dim_at_idx(input.shape.dims, legion_dim_t{2_n}).int_from_positive_int(); + int input_n = + dim_at_idx(input.shape.dims, legion_dim_t{3_n}).int_from_positive_int(); - int output_w = output.shape.at(legion_dim_t(0_n)).int_from_positive_int(); - int output_h = output.shape.at(legion_dim_t(1_n)).int_from_positive_int(); - int output_c = output.shape.at(legion_dim_t(2_n)).int_from_positive_int(); - int output_n = output.shape.at(legion_dim_t(3_n)).int_from_positive_int(); + int output_w = + dim_at_idx(output.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); + int output_h = + dim_at_idx(output.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); + int output_c = + dim_at_idx(output.shape.dims, legion_dim_t{2_n}).int_from_positive_int(); + int output_n = + dim_at_idx(output.shape.dims, legion_dim_t{3_n}).int_from_positive_int(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); @@ -154,13 +163,7 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); - checkCUDNN(cudnnSetTensor4dDescriptor(inputTensor, - CUDNN_TENSOR_NCHW, - CUDNN_DATA_FLOAT, - input_n, - input_c, - input_h, - input_w)); + checkCUDNN(cudnnSetTensorDescriptorFromTensorShape(inputTensor, input.shape)); checkCUDNN(cudnnSetTensor4dDescriptor( biasTensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, output_c, 1, 1)); @@ -253,26 +256,28 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } - Conv2DPerDeviceState per_device_state = {handle, - inputTensor, - biasTensor, - outputTensor, - filterDesc, - actiDesc, - convDesc, - fwdAlgo, - bwdFilterAlgo, - bwdDataAlgo}; + Conv2DPerDeviceState per_device_state = Conv2DPerDeviceState{ + handle, + inputTensor, + biasTensor, + outputTensor, + filterDesc, + actiDesc, + convDesc, + fwdAlgo, + bwdFilterAlgo, + bwdDataAlgo, + }; return per_device_state; } -void forward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *filter_ptr, - float const *bias_ptr, - std::optional activation) { +void gpu_forward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -311,16 +316,16 @@ void forward_kernel(ffStream_t stream, } } -void backward_kernel(ffStream_t stream, - Conv2DPerDeviceState const &m, - float const 
*output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *filter_ptr, - float *filter_grad_ptr, - float *bias_grad_ptr, - std::optional activation) { +void gpu_backward_kernel(ffStream_t stream, + Conv2DPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -386,6 +391,10 @@ void backward_kernel(ffStream_t stream, } } +void gpu_cleanup_kernel(Conv2DPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Conv2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/dropout_kernels.cu b/lib/kernels/src/cuda/ops/dropout_kernels.cu index c5fa56bc78..fc37696c24 100644 --- a/lib/kernels/src/cuda/ops/dropout_kernels.cu +++ b/lib/kernels/src/cuda/ops/dropout_kernels.cu @@ -21,11 +21,11 @@ namespace FlexFlow { namespace Kernels { namespace Dropout { -DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, - float rate, - unsigned long long seed, - ArrayShape const &output_shape, - Allocator allocator) { +DropoutPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffDropoutDescriptor_t dropoutDesc; @@ -37,9 +37,10 @@ DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateDropoutDescriptor(&dropoutDesc)); checkCUDNN(cudnnDropoutGetStatesSize(handle.dnn, &(dropoutStateSize))); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, output_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputTensor, output_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); checkCUDNN( cudnnDropoutGetReserveSpaceSize(outputTensor, &(reserveSpaceSize))); { @@ -50,21 +51,23 @@ DropoutPerDeviceState init_kernel(PerDeviceFFHandle handle, } checkCUDNN(cudnnSetDropoutDescriptor( dropoutDesc, handle.dnn, rate, dropoutStates, dropoutStateSize, seed)); - DropoutPerDeviceState per_device_state = {handle, - inputTensor, - outputTensor, - dropoutDesc, - reserveSpace, - dropoutStates, - reserveSpaceSize, - dropoutStateSize}; + DropoutPerDeviceState per_device_state = DropoutPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*dropoutDesc=*/dropoutDesc, + /*reserveSpace=*/reserveSpace, + /*dropoutStates=*/dropoutStates, + /*reserveSpaceSize=*/reserveSpaceSize, + /*dropoutStateSize=*/dropoutStateSize, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - DropoutPerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + DropoutPerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); checkCUDNN(cudnnDropoutForward(m.handle.dnn, @@ -77,10 +80,10 @@ void forward_kernel(cudaStream_t stream, m.reserveSpaceSize)); } -void backward_kernel(cudaStream_t stream, - DropoutPerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, 
+ DropoutPerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); checkCUDNN(cudnnDropoutBackward(m.handle.dnn, @@ -93,15 +96,12 @@ void backward_kernel(cudaStream_t stream, m.reserveSpaceSize)); } -void cleanup_kernel(Allocator allocator, - ffTensorDescriptor_t inputTensor, - ffTensorDescriptor_t outputTensor, - ffDropoutDescriptor_t dropoutDesc, - void *dropoutStates) { - allocator.deallocate(dropoutStates); - checkCUDNN(cudnnDestroyTensorDescriptor(inputTensor)); - checkCUDNN(cudnnDestroyTensorDescriptor(outputTensor)); - checkCUDNN(cudnnDestroyDropoutDescriptor(dropoutDesc)); +void gpu_cleanup_kernel(Allocator &allocator, + DropoutPerDeviceState const &per_device_state) { + allocator.deallocate(per_device_state.dropoutStates); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.inputTensor)); + checkCUDNN(cudnnDestroyTensorDescriptor(per_device_state.outputTensor)); + checkCUDNN(cudnnDestroyDropoutDescriptor(per_device_state.dropoutDesc)); } } // namespace Dropout diff --git a/lib/kernels/src/cuda/ops/element_binary_kernels.cu b/lib/kernels/src/cuda/ops/element_binary_kernels.cu index 3a4a77b3dd..7e13486429 100644 --- a/lib/kernels/src/cuda/ops/element_binary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_binary_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/element_binary_kernels.h" +#include "kernels/element_binary_kernels_gpu.h" #include "kernels/ff_handle.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" @@ -79,13 +79,13 @@ __global__ void elewise_binary_backward_kernel(size_t volume, } } -ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, - OperatorType op_type, - bool should_broadcast_lhs, - bool should_broadcast_rhs, - ArrayShape lhs_shape, - ArrayShape rhs_shape, - ArrayShape output_shape) { +ElementBinaryPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape) { ffTensorDescriptor_t inputLHSTensor; ffTensorDescriptor_t inputRHSTensor; ffTensorDescriptor_t outputTensor; @@ -124,28 +124,32 @@ ElementBinaryPerDeviceState init_kernel(PerDeviceFFHandle handle, CUDNN_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputLHSTensor, lhs_shape)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputRHSTensor, rhs_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputLHSTensor, lhs_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(inputRHSTensor, rhs_shape)); + checkCUDNN( + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); - ElementBinaryPerDeviceState per_device_state = {handle, - inputLHSTensor, - inputRHSTensor, - outputTensor, - opDesc, - reduceAddDesc}; + ElementBinaryPerDeviceState per_device_state = ElementBinaryPerDeviceState{ + /*handle=*/handle, + /*inputLHSTensor=*/inputLHSTensor, + /*inputRHSTensor=*/inputRHSTensor, + /*outputTensor=*/outputTensor, + /*opDesc=*/opDesc, + /*reduceAddDesc=*/reduceAddDesc, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *lhs_ptr, - float const *rhs_ptr, - float *out_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - PerDeviceFFHandle 
handle) { +void gpu_forward_kernel(cudaStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *lhs_ptr, + float const *rhs_ptr, + float *out_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + PerDeviceFFHandle handle) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); float alpha1 = 1.0f, alpha2 = 1.0f, beta = 0.0f; @@ -242,17 +246,17 @@ void forward_kernel(cudaStream_t stream, } } -void backward_kernel(cudaStream_t stream, - ElementBinaryPerDeviceState const &m, - float const *out_grad_ptr, - float const *lhs_ptr, - float const *rhs_ptr, - float *lhs_grad_ptr, - float *rhs_grad_ptr, - OperatorType op_type, - bool broadcast_inputLHS, - bool broadcast_inputRHS, - PerDeviceFFHandle handle) { +void gpu_backward_kernel(cudaStream_t stream, + ElementBinaryPerDeviceState const &m, + float const *out_grad_ptr, + float const *lhs_ptr, + float const *rhs_ptr, + float *lhs_grad_ptr, + float *rhs_grad_ptr, + OperatorType op_type, + bool broadcast_inputLHS, + bool broadcast_inputRHS, + PerDeviceFFHandle handle) { checkCUBLAS(cublasSetStream(handle.blas, stream)); checkCUDNN(cudnnSetStream(handle.dnn, stream)); @@ -421,6 +425,10 @@ void backward_kernel(cudaStream_t stream, } } +void gpu_cleanup_kernel(ElementBinaryPerDeviceState const &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace ElementBinary } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index 21ac95c204..8fdc3ca8ee 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/element_unary_kernels.h" +#include "kernels/element_unary_kernels_gpu.h" #include "op-attrs/get_op_type.h" #include @@ -48,9 +48,10 @@ static bool use_scalar(OperatorType op_type) { } } -static ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - OperatorType op_type) { +static ElementUnaryPerDeviceState + gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + OperatorType op_type) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; @@ -81,18 +82,22 @@ static ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, checkCUDNN( cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); + cudnnSetTensorDescriptorFromTensorShape(inputTensor, input_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); } - return {inputTensor, outputTensor, actiDesc}; + return ElementUnaryPerDeviceState{ + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + }; } -ElementUnaryPerDeviceState init_kernel(ArrayShape const &input_shape, - ArrayShape const &output_shape, - ElementUnaryAttrs const &attrs) { - return init_kernel(input_shape, output_shape, get_op_type(attrs)); +ElementUnaryPerDeviceState gpu_init_kernel(TensorShape const &input_shape, + TensorShape const &output_shape, + ElementUnaryAttrs const &attrs) { + return gpu_init_kernel(input_shape, output_shape, get_op_type(attrs)); } template @@ -254,6 +259,10 @@ struct ForwardKernel { GenericTensorAccessorR const &input, 
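Where the cuDNN path does not apply, element-wise binary ops fall back to a grid-stride loop in which broadcasting only changes how the operand index is derived. A simplified sketch for ADD with the LHS broadcast across the leading dimension; the in-tree elewise_binary kernels cover more operators and broadcast patterns:

#include <cuda_runtime.h>
#include <cstddef>

// lhs has `inner` elements when broadcast, otherwise outer * inner;
// rhs and out always have outer * inner elements.
__global__ void elewise_add_broadcast_sketch(float const *lhs,
                                             float const *rhs,
                                             float *out,
                                             size_t outer, size_t inner,
                                             bool broadcast_lhs) {
  size_t n = outer * inner;
  for (size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x; i < n;
       i += (size_t)blockDim.x * gridDim.x) {
    size_t lhs_i = broadcast_lhs ? (i % inner) : i; // reuse the same LHS row
    out[i] = lhs[lhs_i] + rhs[i];
  }
}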
GenericTensorAccessorW const &output) const { checkCUDNN(cudnnSetStream(handle.dnn, stream)); + + size_t num_elements = + get_num_elements(input.shape.dims).int_from_positive_int(); + if (use_cudnn(op_type)) { float alpha = 1.0f, beta = 0.0f; checkCUDNN(cudnnActivationForward(handle.dnn, @@ -266,7 +275,6 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +283,6 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -295,6 +302,8 @@ struct BackwardKernel { GenericTensorAccessorR const &input, GenericTensorAccessorW const &input_grad) { checkCUDNN(cudnnSetStream(handle.dnn, stream)); + size_t num_elements = + get_num_elements(input.shape.dims).int_from_positive_int(); if (use_cudnn(op_type)) { float alpha = 1.0f; @@ -312,7 +321,6 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, @@ -323,7 +331,6 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements().int_from_positive_int(); elewise_unary_backward_kernel> <<>>( num_elements, @@ -336,13 +343,13 @@ struct BackwardKernel { } }; -void forward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, +void gpu_forward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + DataTypeDispatch1{}(input.shape.data_type, stream, device_state, get_op_type(attrs), @@ -352,15 +359,15 @@ void forward_kernel(ffStream_t stream, output); } -void backward_kernel(ffStream_t stream, - ElementUnaryPerDeviceState const &device_state, - ElementUnaryAttrs const &attrs, - PerDeviceFFHandle const &handle, - GenericTensorAccessorR const &output, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}(input.data_type, +void gpu_backward_kernel(ffStream_t stream, + ElementUnaryPerDeviceState const &device_state, + ElementUnaryAttrs const &attrs, + PerDeviceFFHandle const &handle, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad) { + DataTypeDispatch1{}(input.shape.data_type, stream, device_state, get_op_type(attrs), @@ -372,6 +379,10 @@ void backward_kernel(ffStream_t stream, input_grad); } +void gpu_cleanup_kernel(ElementUnaryPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace ElementUnary } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 9dee095071..e3495750c2 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ 
b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -15,37 +15,38 @@ #include "internal/device.h" #include "kernels/accessor.h" -#include "kernels/flat_kernels.h" +#include "kernels/flat_kernels_gpu.h" +#include "op-attrs/tensor_shape.h" namespace FlexFlow { namespace Kernels { namespace Flat { -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR input, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + float *output_ptr) { - checkCUDA(cudaMemcpyAsync(output_ptr, - input.get_float_ptr(), - input.shape.num_elements().int_from_positive_int() * - sizeof(float), - cudaMemcpyDeviceToDevice, - stream)); + checkCUDA(cudaMemcpyAsync( + output_ptr, + input.get_float_ptr(), + get_size_in_bytes(input.shape).unwrap_num_bytes().unwrap_nonnegative(), + cudaMemcpyDeviceToDevice, + stream)); } -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR input, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + float const *output_grad_ptr, + float *input_grad_ptr) { float alpha = 1.0f; apply_add_with_scale - <<>>(input_grad_ptr, output_grad_ptr, - input.shape.num_elements().int_from_positive_int(), + get_num_elements(input.shape.dims).int_from_positive_int(), alpha); } diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index e251a57f8a..7b173fdd5e 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -16,7 +16,8 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" #include "kernels/device.h" -#include "kernels/gather_kernels.h" +#include "kernels/gather_kernels_gpu.h" +#include "op-attrs/ff_dim_t.h" namespace FlexFlow::Kernels::Gather { @@ -117,72 +118,84 @@ struct BackwardKernel { } }; -void forward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &output) { +GatherPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim) { + return GatherPerDeviceState{ + /*handle=*/handle, + /*dim=*/dim, + }; +} + +void gpu_forward_kernel(ffStream_t stream, + GatherPerDeviceState const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &output) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output.shape - .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) - .num_elements() - .int_from_positive_int(); - if (m.legion_dim.value == 0_n) { + + std::optional stride = std::nullopt; + if (m.dim.value == 0_n) { stride = 1; + } else { + stride = get_num_elements(slice_tensor_dims(output.shape.dims, + add_to_ff_dim(m.dim, -1), + std::nullopt)) + .int_from_positive_int(); } coord_t output_dim_size = - output.shape.at(m.legion_dim).int_from_positive_int(); - coord_t input_dim_size = input.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(output.shape.dims, m.dim).int_from_positive_int(); + coord_t input_dim_size = + dim_at_idx(input.shape.dims, m.dim).int_from_positive_int(); - assert(index.data_type == DataType::INT32 || - index.data_type == DataType::INT64); + assert(index.shape.data_type == DataType::INT32 || + index.shape.data_type == DataType::INT64); DataTypeDispatch1{}( - index.data_type, + index.shape.data_type, stream, input, index, output, - output.shape.num_elements().int_from_positive_int(), - 
stride, + get_num_elements(output.shape.dims).int_from_positive_int(), + stride.value(), input_dim_size, output_dim_size); } -void backward_kernel(ffStream_t stream, - GatherPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &index, - GenericTensorAccessorW const &input_grad) { +void gpu_backward_kernel(ffStream_t stream, + GatherPerDeviceState const &m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &index, + GenericTensorAccessorW const &input_grad) { checkCUDA(get_legion_stream(&stream)); - coord_t stride = - output_grad.shape - .sub_shape(legion_dim_t{0_n}, add_to_legion_dim(m.legion_dim, 1)) - .num_elements() - .int_from_positive_int(); - if (m.legion_dim.value == 0_n) { + std::optional stride = std::nullopt; + if (m.dim.value == 0_n) { stride = 1; + } else { + stride = get_num_elements(slice_tensor_dims(output_grad.shape.dims, + add_to_ff_dim(m.dim, -1), + std::nullopt)) + .int_from_positive_int(); } coord_t output_dim_size = - output_grad.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(output_grad.shape.dims, m.dim).int_from_positive_int(); coord_t input_dim_size = - input_grad.shape.at(m.legion_dim).int_from_positive_int(); + dim_at_idx(input_grad.shape.dims, m.dim).int_from_positive_int(); - assert(index.data_type == DataType::INT32 || - index.data_type == DataType::INT64); + assert(index.shape.data_type == DataType::INT32 || + index.shape.data_type == DataType::INT64); DataTypeDispatch1{}( - index.data_type, + index.shape.data_type, stream, output_grad, index, input_grad, - output_grad.shape.num_elements().int_from_positive_int(), - stride, + get_num_elements(output_grad.shape.dims).int_from_positive_int(), + stride.value(), input_dim_size, output_dim_size); } diff --git a/lib/kernels/src/cuda/ops/layer_norm_kernels.cu b/lib/kernels/src/cuda/ops/layer_norm_kernels.cu index 40c3e79e41..31f26cca02 100644 --- a/lib/kernels/src/cuda/ops/layer_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/layer_norm_kernels.cu @@ -15,7 +15,7 @@ #include "kernels/accessor.h" #include "kernels/datatype_dispatch.h" -#include "kernels/layer_norm_kernels.h" +#include "kernels/layer_norm_kernels_gpu.h" namespace FlexFlow { @@ -289,12 +289,12 @@ __global__ void GammaBetaBackwardCUDAKernel(int64_t M, } // TODO: handle any data type for stats -LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - Allocator &allocator, - bool elementwise_affine_, - int64_t effective_batch_size_, - int64_t effective_num_elements_, - float eps_) { +LayerNormPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + Allocator &allocator, + bool elementwise_affine_, + int64_t effective_batch_size_, + int64_t effective_num_elements_, + float eps_) { float *mean = (float *)allocator.allocate(sizeof(float) * effective_batch_size_); float *rstd = @@ -307,18 +307,20 @@ LayerNormPerDeviceState init_kernel(PerDeviceFFHandle const &handle, (float *)allocator.allocate(sizeof(float) * effective_batch_size_); float *bias = (float *)allocator.allocate(sizeof(float) * effective_batch_size_); - LayerNormPerDeviceState per_device_state = {handle, - elementwise_affine_, - effective_batch_size_, - effective_num_elements_, - eps_, - mean, - rstd, - ds, - db, - scale, - bias, - DataType::FLOAT}; + LayerNormPerDeviceState per_device_state = LayerNormPerDeviceState{ + /*handle=*/handle, + /*elementwise_affine=*/elementwise_affine_, + /*effective_num_elements=*/effective_num_elements_, + /*effective_batch_size=*/effective_batch_size_, + 
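The gather rewrite replaces compute-then-overwrite stride logic with a std::optional that every branch assigns before it is read. A condensed sketch of that control-flow pattern; the product below mirrors the slice_tensor_dims(dims, dim - 1, std::nullopt) call in the hunk above and is not a claim about the exact in-tree index semantics:

#include <cstdint>
#include <optional>
#include <vector>

int64_t gather_stride_sketch(std::vector<int64_t> const &dims, size_t dim) {
  std::optional<int64_t> stride = std::nullopt;
  if (dim == 0) {
    stride = 1;
  } else {
    int64_t product = 1;
    for (size_t d = dim - 1; d < dims.size(); d++) { // dims[dim-1 ..]
      product *= dims[d];
    }
    stride = product;
  }
  return stride.value(); // safe: both branches assigned above
}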
/*eps=*/eps_, + /*mean=*/mean, + /*rstd=*/rstd, + /*ds=*/ds, + /*db=*/db, + /*scale=*/scale, + /*bias=*/bias, + /*data_type=*/DataType::FLOAT, + }; return per_device_state; } @@ -407,24 +409,24 @@ struct BackwardKernel { } }; -void forward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - GenericTensorAccessorW const &gamma, - GenericTensorAccessorW const &beta) { +void gpu_forward_kernel(cudaStream_t stream, + LayerNormPerDeviceState const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &gamma, + GenericTensorAccessorW const &beta) { DataTypeDispatch1{}( m.data_type, stream, m, input, output, gamma, beta); } -void backward_kernel(cudaStream_t stream, - LayerNormPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &input_grad, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorW const &gamma_grad, - GenericTensorAccessorW const &beta_grad) { +void gpu_backward_kernel(cudaStream_t stream, + LayerNormPerDeviceState const &m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { DataTypeDispatch1{}(m.data_type, stream, m, @@ -436,6 +438,10 @@ void backward_kernel(cudaStream_t stream, beta_grad); } +void gpu_cleanup_kernel(LayerNormPerDeviceState const &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace LayerNorm } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/linear_kernels.cu b/lib/kernels/src/cuda/ops/linear_kernels.cu index 02bda55828..fa474d854a 100644 --- a/lib/kernels/src/cuda/ops/linear_kernels.cu +++ b/lib/kernels/src/cuda/ops/linear_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/allocation.h" -#include "kernels/linear_kernels.h" +#include "kernels/linear_kernels_gpu.h" #include "utils/integer_conversions.h" namespace FlexFlow { @@ -23,7 +23,7 @@ namespace FlexFlow { namespace Kernels { namespace Linear { -bool use_activation(std::optional activation) { +static bool use_activation(std::optional activation) { if (activation.has_value()) { switch (activation.value()) { case Activation::RELU: @@ -40,17 +40,16 @@ bool use_activation(std::optional activation) { return false; } -// what's the float * one_ptr -LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, - float *one_ptr, - std::optional activation, - std::optional regularizer, - bool use_bias, - DataType input_type, - DataType weight_type, - DataType output_type, - int batch_size, - int channel) { +LinearPerDeviceState + gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int channel) { ffTensorDescriptor_t outputTensor; ffActivationDescriptor_t actiDesc; checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -82,6 +81,8 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, // Unsupported activation mode assert(false); } + } else { + mode = CUDNN_ACTIVATION_IDENTITY; } checkCUDNN( cudnnSetActivationDescriptor(actiDesc, mode, CUDNN_PROPAGATE_NAN, 0.0)); @@ -91,30 +92,41 @@ LinearPerDeviceState init_kernel(PerDeviceFFHandle handle, // 
todo: how to use allocator to allocate memory for float * one_ptr, how many // bytes to allocate? + float *one_ptr; checkCUDA(cudaMalloc(&one_ptr, sizeof(float) * batch_size)); - LinearPerDeviceState per_device_state = {handle, - outputTensor, - actiDesc, - one_ptr, - mode, - activation, - regularizer, - use_bias, - input_type, - weight_type, - output_type}; + float one_ptr_cpu[batch_size]; + for (int i = 0; i < batch_size; i++) { + one_ptr_cpu[i] = 1.0; + } + checkCUDA(cudaMemcpy(one_ptr, + one_ptr_cpu, + sizeof(float) * batch_size, + cudaMemcpyHostToDevice)); + LinearPerDeviceState per_device_state = LinearPerDeviceState{ + /*handle=*/handle, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + /*one_ptr=*/one_ptr, + /*mode=*/mode, + /*activation=*/activation, + /*regularizer=*/regularizer, + /*use_bias=*/use_bias, + /*input_type=*/input_type, + /*weight_type=*/weight_type, + /*output_type=*/output_type, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - LinearPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - float const *weight_ptr, - float const *bias_ptr, - int in_dim, - int out_dim, - int batch_size) { +void gpu_forward_kernel(cudaStream_t stream, + LinearPerDeviceState const &m, + float const *input_ptr, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { checkCUBLAS(cublasSetStream(m.handle.blas, stream)); checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -147,10 +159,9 @@ void forward_kernel(cudaStream_t stream, out_dim, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // use_bias = True - if (bias_ptr != NULL) { + if (bias_ptr != nullptr) { checkCUBLAS(cublasGemmEx(m.handle.blas, - CUBLAS_OP_T, + CUBLAS_OP_N, CUBLAS_OP_N, out_dim, batch_size, @@ -158,7 +169,7 @@ void forward_kernel(cudaStream_t stream, &alpha, static_cast(bias_ptr), weight_type, - 1, + out_dim, static_cast(m.one_ptr), CUDA_R_32F, 1, @@ -169,38 +180,38 @@ void forward_kernel(cudaStream_t stream, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - if (use_activation(m.activation)) { - checkCUDNN(cudnnActivationForward(m.handle.dnn, - m.actiDesc, - &alpha, - m.outputTensor, - static_cast(output_ptr), - &beta, - m.outputTensor, - static_cast(output_ptr))); - } else if (m.activation == Activation::GELU) { - size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); - constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) - constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( - elements, B, C, (float *)output_ptr); - } else { - // Do nothing - } + // if (use_activation(m.activation)) { + // checkCUDNN(cudnnActivationForward(m.handle.dnn, + // m.actiDesc, + // &alpha, + // m.outputTensor, + // static_cast(output_ptr), + // &beta, + // m.outputTensor, + // static_cast(output_ptr))); + // } else if (m.activation == Activation::GELU) { + // size_t elements = size_t_from_int(out_dim) * size_t_from_int(batch_size); + // constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) + // constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) + // gelu_forward_kernel<<>>( + // elements, B, C, (float *)output_ptr); + // } else { + // // Do nothing + // } } -void backward_kernel(cudaStream_t stream, - LinearPerDeviceState const &m, - float const *output_ptr, - float *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *kernel_ptr, - float *kernel_grad_ptr, - float *bias_grad_ptr, - int in_dim, - 
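The one_ptr initialization above answers the old TODO by filling a host array of ones and copying it to the device, but it does so with a variable-length stack array, which is a compiler extension rather than standard C++. A heap-backed sketch of the same initialization; cudaMemset is not an option here, since 1.0f is not a repeated single-byte pattern:

#include <cuda_runtime.h>
#include <vector>

// Allocate and fill a device buffer of `batch_size` floats with 1.0f.
float *make_one_ptr_sketch(int batch_size) {
  float *one_ptr = nullptr;
  cudaMalloc(&one_ptr, sizeof(float) * batch_size);
  std::vector<float> ones(batch_size, 1.0f); // heap-backed, standard C++
  cudaMemcpy(one_ptr, ones.data(), sizeof(float) * batch_size,
             cudaMemcpyHostToDevice);
  return one_ptr;
}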
int out_dim, - int batch_size) { +void gpu_backward_kernel(cudaStream_t stream, + LinearPerDeviceState const &m, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *kernel_ptr, + float *kernel_grad_ptr, + float *bias_grad_ptr, + int in_dim, + int out_dim, + int batch_size) { checkCUBLAS(cublasSetStream(m.handle.blas, stream)); checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f; @@ -229,9 +240,10 @@ void backward_kernel(cudaStream_t stream, stream); } else { // TODO: only support relu and sigmoid for now - assert(false && "Unsupported activation for Linear"); + PANIC("Unsupported activation for Linear", m.activation.value()); } } + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUBLAS(cublasGemmEx(m.handle.blas, @@ -328,6 +340,10 @@ void backward_kernel(cudaStream_t stream, } } +void gpu_cleanup_kernel(LinearPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Linear } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu deleted file mode 100644 index 94690a74fb..0000000000 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/partition_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Repartition { - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - add_kernel> - <<>>(input_grad.get(), - output_grad.get(), - input_grad.shape.num_elements().int_from_positive_int()); - } -}; - -RepartitionPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - DataType data_type) { - RepartitionPerDeviceState per_device_state = {handle, data_type}; - return per_device_state; -} - -void forward_kernel(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(m.data_type, stream, m, input, output); -} - -void backward_kernel(cudaStream_t stream, - RepartitionPerDeviceState const &m, - GenericTensorAccessorR const &output_grad, - GenericTensorAccessorW const &input_grad) { - DataTypeDispatch1{}( - m.data_type, stream, m, output_grad, input_grad); -} - -} // namespace Repartition -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu index e8ea3f64c2..ec185a360e 100644 --- a/lib/kernels/src/cuda/ops/pool_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/pool_2d_kernels.cu @@ -14,30 +14,30 @@ */ #include "internal/device.h" -#include "kernels/pool_2d_kernels.h" +#include "kernels/pool_2d_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Pool2D { -Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, - std::optional activation, - int input_w, - int input_h, - int input_c, - int input_n, - int output_w, - int output_h, - int output_c, - int output_n, - int pad_h, - int pad_w, - int kernel_h, - int kernel_w, - int stride_h, - int stride_w, - PoolOp pool_type) { +Pool2DPerDeviceState gpu_init_kernel(PerDeviceFFHandle handle, + std::optional activation, + int input_w, + int input_h, + int input_c, + int input_n, + int output_w, + int output_h, + int output_c, + int output_n, + int pad_h, + int pad_w, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + PoolOp pool_type) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; ffActivationDescriptor_t actiDesc; @@ -87,15 +87,21 @@ Pool2DPerDeviceState init_kernel(PerDeviceFFHandle handle, if (activation == Activation::RELU) { relu = true; } - Pool2DPerDeviceState state = { - handle, inputTensor, outputTensor, actiDesc, poolDesc, relu}; + Pool2DPerDeviceState state = Pool2DPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*actiDesc=*/actiDesc, + /*poolDesc=*/poolDesc, + /*relu=*/relu, + }; return state; } -void forward_kernel(cudaStream_t stream, - Pool2DPerDeviceState const &m, - void const *input_ptr, - void *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + Pool2DPerDeviceState const &m, + void const *input_ptr, + void 
*output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -110,12 +116,12 @@ void forward_kernel(cudaStream_t stream, output_ptr)); } -void backward_kernel(cudaStream_t stream, - Pool2DPerDeviceState const &m, - void const *output_ptr, - void const *output_grad_ptr, - void const *input_ptr, - void *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + Pool2DPerDeviceState const &m, + void const *output_ptr, + void const *output_grad_ptr, + void const *input_ptr, + void *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); @@ -134,6 +140,10 @@ void backward_kernel(cudaStream_t stream, input_grad_ptr)); } +void gpu_cleanup_kernel(Pool2DPerDeviceState &per_device_state) { + NOT_IMPLEMENTED(); +} + } // namespace Pool2D } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reduce_kernels.cu b/lib/kernels/src/cuda/ops/reduce_kernels.cu index 563bbae21d..20c974e4d8 100644 --- a/lib/kernels/src/cuda/ops/reduce_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduce_kernels.cu @@ -14,17 +14,17 @@ */ #include "internal/device.h" -#include "kernels/reduce_kernels.h" +#include "kernels/reduce_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Reduce { -ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, - OperatorType const &op_type, - size_t const &reduction_size, - ArrayShape const &input_shape, - ArrayShape const &output_shape) { +ReducePerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + OperatorType const &op_type, + size_t const &reduction_size, + TensorShape const &input_shape, + TensorShape const &output_shape) { ffTensorDescriptor_t inputTensor; ffTensorDescriptor_t outputTensor; @@ -35,19 +35,25 @@ ReducePerDeviceState init_kernel(PerDeviceFFHandle const &handle, checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); - checkCUDNN(cudnnSetTensorDescriptorFromArrayShape(inputTensor, input_shape)); + checkCUDNN(cudnnSetTensorDescriptorFromTensorShape(inputTensor, input_shape)); checkCUDNN( - cudnnSetTensorDescriptorFromArrayShape(outputTensor, output_shape)); + cudnnSetTensorDescriptorFromTensorShape(outputTensor, output_shape)); - ReducePerDeviceState per_device = { - handle, inputTensor, outputTensor, reduceDesc, op_type, reduction_size}; + ReducePerDeviceState per_device = ReducePerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*outputTensor=*/outputTensor, + /*reduceDesc=*/reduceDesc, + /*op_type=*/op_type, + /*reduction_size=*/reduction_size, + }; return per_device; } -void forward_kernel(cudaStream_t stream, - ReducePerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + ReducePerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; checkCUDNN(cudnnReduceTensor(m.handle.dnn, @@ -64,10 +70,10 @@ void forward_kernel(cudaStream_t stream, output_ptr)); }; -void backward_kernel(cudaStream_t stream, - ReducePerDeviceState const &m, - float const *output_grad_ptr, - float *input_grad_ptr) { +void gpu_backward_kernel(cudaStream_t stream, + ReducePerDeviceState const &m, + float const *output_grad_ptr, + float *input_grad_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0, beta = 1.0f; switch (m.op_type) { diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu deleted file mode 100644 index 93400d333f..0000000000 --- 
a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/reduction_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Reduction { - -template -__global__ void reduction_forward_kernel(T const *input_ptr, - T *output_ptr, - size_t num_elements, - size_t num_replicas) { - CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input_ptr[i]; - for (size_t j = 1; j < num_replicas; j++) { - output_ptr[i] += input_ptr[i + j * num_elements]; - } - } -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas) { - - size_t total_elements = - input.shape.num_elements().int_from_positive_int() * num_replicas; - reduction_forward_kernel> - <<>>( - input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int(), - num_replicas); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - checkCUDA( - cudaMemcpyAsync(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output, - size_t num_replicas) { - DataTypeDispatch1{}( - input.data_type, stream, input, output, num_replicas); -} - -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - DataTypeDispatch1{}(output.data_type, stream, output, input); -} - -} // namespace Reduction -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu deleted file mode 100644 index 9f532c96b1..0000000000 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "internal/device.h" -#include "kernels/datatype_dispatch.h" -#include "kernels/replicate_kernels.h" - -namespace FlexFlow { -namespace Kernels { -namespace Replicate { - -template -__global__ void replicate_backward_kernel(T const *output_ptr, - T *input_ptr, - size_t num_elements, - size_t num_replicas) { - CUDA_KERNEL_LOOP(i, num_elements) { - for (size_t j = 0; j < num_replicas; j++) { - input_ptr[i] += output_ptr[i + j * num_elements]; - } - } -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync((void *)output.get(), - (void *)input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); - } -}; - -template -struct BackwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas) { - size_t total_elements = - input.shape.num_elements().int_from_positive_int() * num_replicas; - replicate_backward_kernel> - <<>>( - output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int(), - num_replicas); - } -}; - -void forward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - DataTypeDispatch1{}(input.data_type, stream, input, output); -} - -void backward_kernel(cudaStream_t stream, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input, - size_t num_replicas) { - DataTypeDispatch1{}( - input.data_type, stream, output, input, num_replicas); -} - -} // namespace Replicate -} // namespace Kernels -} // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 3f0d6bb15a..1414706ef0 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -15,61 +15,52 @@ #include "internal/device.h" #include "kernels/datatype_dispatch.h" -#include "kernels/reshape_kernels.h" +#include "kernels/reshape_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Reshape { -ReshapePerDeviceState init_kernel(DataType data_type) { - return ReshapePerDeviceState{data_type}; -} - -template -struct ForwardKernel { - void operator()(cudaStream_t stream, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - checkCUDA( - cudaMemcpyAsync(output.get(), - input.get(), - input.shape.num_elements().int_from_positive_int() * - size_of_datatype(T).int_from_positive_int(), - cudaMemcpyDeviceToDevice, - stream)); +template +__global__ void apply_add_with_scale2(DT *data_ptr, + DTGrad const *grad_ptr, + size_t size, + DT scale) { + CUDA_KERNEL_LOOP(i, size) { + data_ptr[i] += grad_ptr[i] * scale; } -}; +} -template +template struct BackwardKernel { void operator()(cudaStream_t stream, GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { float alpha = 1.0f; - apply_add_with_scale> - <<, real_type_t> + <<>>(input.get(), - output.get(), - input.shape.num_elements().int_from_positive_int(), - static_cast>(alpha)); + stream>>>(input.get(), + output.get(), + get_num_elements(input.shape.dims).int_from_positive_int(), + static_cast>(alpha)); } }; -void forward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { - 
DataTypeDispatch1{}(m.data_type, stream, input, output); +void gpu_forward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + copy_accessor_data_to_l_from_r(output, input); } -void backward_kernel(cudaStream_t stream, - ReshapePerDeviceState const &m, - GenericTensorAccessorR const &output, - GenericTensorAccessorW const &input) { - DataTypeDispatch1{}(m.data_type, stream, output, input); +void gpu_backward_kernel(cudaStream_t stream, + GenericTensorAccessorR const &output, + GenericTensorAccessorW const &input) { + DataTypeDispatch2{}( + input.shape.data_type, output.shape.data_type, stream, output, input); } } // namespace Reshape diff --git a/lib/kernels/src/cuda/ops/reverse_kernels.cu b/lib/kernels/src/cuda/ops/reverse_kernels.cu index c63be7f9b4..1fabf4a67e 100644 --- a/lib/kernels/src/cuda/ops/reverse_kernels.cu +++ b/lib/kernels/src/cuda/ops/reverse_kernels.cu @@ -14,7 +14,7 @@ */ #include "internal/device.h" -#include "kernels/reverse_kernels.h" +#include "kernels/reverse_kernels_gpu.h" #include "kernels/reverse_kernels_params.h" namespace FlexFlow::Kernels::Reverse { @@ -51,13 +51,13 @@ static void forward_kernel_internal(cudaStream_t stream, in_ptr, out_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void forward_kernel(ffStream_t stream, - GenericTensorAccessorR const &input_accessor, - GenericTensorAccessorW &output_accessor, - ReverseAttrs const &attrs) { +void gpu_forward_kernel(ffStream_t stream, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW &output_accessor, + ReverseAttrs const &attrs) { auto reverse_kernels_params = - compute_reverse_kernels_params(output_accessor.shape, attrs); + compute_reverse_kernels_params(output_accessor.shape.dims, attrs); forward_kernel_internal( stream, @@ -84,12 +84,12 @@ void backward_kernel_internal(cudaStream_t stream, out_grad_ptr, in_grad_ptr, num_out_blks, reverse_dim_size, in_blk_size); } -void backward_kernel(ffStream_t stream, - GenericTensorAccessorR const &output_grad_accessor, - GenericTensorAccessorW &input_grad_accessor, - ReverseAttrs const &attrs) { +void gpu_backward_kernel(ffStream_t stream, + GenericTensorAccessorR const &output_grad_accessor, + GenericTensorAccessorW &input_grad_accessor, + ReverseAttrs const &attrs) { auto reverse_kernels_params = - compute_reverse_kernels_params(input_grad_accessor.shape, attrs); + compute_reverse_kernels_params(input_grad_accessor.shape.dims, attrs); backward_kernel_internal( stream, diff --git a/lib/kernels/src/cuda/ops/softmax_kernels.cu b/lib/kernels/src/cuda/ops/softmax_kernels.cu index da0ffd846e..85575d7bf6 100644 --- a/lib/kernels/src/cuda/ops/softmax_kernels.cu +++ b/lib/kernels/src/cuda/ops/softmax_kernels.cu @@ -14,19 +14,19 @@ */ #include "internal/device.h" -#include "kernels/softmax_kernels.h" +#include "kernels/softmax_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Softmax { -SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, - int dim, - int input_n, - int input_c, - int input_h, - int input_w) { +SoftmaxPerDeviceState gpu_init_kernel(PerDeviceFFHandle const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w) { ffTensorDescriptor_t inputTensor; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); @@ -38,14 +38,18 @@ SoftmaxPerDeviceState init_kernel(PerDeviceFFHandle const &handle, input_h, input_w)); - SoftmaxPerDeviceState per_device_state = {handle, inputTensor, dim}; + SoftmaxPerDeviceState per_device_state = 
SoftmaxPerDeviceState{ + /*handle=*/handle, + /*inputTensor=*/inputTensor, + /*dim=*/dim, + }; return per_device_state; } -void forward_kernel(cudaStream_t stream, - SoftmaxPerDeviceState const &m, - float const *input_ptr, - float *output_ptr) { +void gpu_forward_kernel(cudaStream_t stream, + SoftmaxPerDeviceState const &m, + float const *input_ptr, + float *output_ptr) { checkCUDNN(cudnnSetStream(m.handle.dnn, stream)); float alpha = 1.0f, beta = 0.0f; @@ -60,10 +64,10 @@ void forward_kernel(cudaStream_t stream, output_ptr)); } -void backward_kernel(cudaStream_t stream, - float const *output_grad_ptr, - float *input_grad_ptr, - size_t num_elements) { +void gpu_backward_kernel(cudaStream_t stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { checkCUDA(cudaMemcpyAsync(input_grad_ptr, output_grad_ptr, @@ -72,6 +76,10 @@ void backward_kernel(cudaStream_t stream, stream)); } +void gpu_cleanup_kernel(SoftmaxPerDeviceState &) { + NOT_IMPLEMENTED(); +} + } // namespace Softmax } // namespace Kernels } // namespace FlexFlow diff --git a/lib/kernels/src/cuda/ops/split_kernels.cu b/lib/kernels/src/cuda/ops/split_kernels.cu index 5c8b305851..ca953bd7b1 100644 --- a/lib/kernels/src/cuda/ops/split_kernels.cu +++ b/lib/kernels/src/cuda/ops/split_kernels.cu @@ -14,20 +14,20 @@ */ #include "internal/device.h" -#include "kernels/split_kernels.h" +#include "kernels/split_kernels_gpu.h" namespace FlexFlow { namespace Kernels { namespace Split { -void forward_kernel(cudaStream_t stream, - float **out_ptrs, - float const *in_ptr, - coord_t const *out_blk_sizes, - coord_t in_blk_size, - coord_t num_blks, - int numOutputs) { +void gpu_forward_kernel(cudaStream_t stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { for (int i = 0; i < numOutputs; i++) { copy_with_stride<< struct LinearData { typedef Entry Entry; @@ -369,15 +364,14 @@ __global__ void topk_forward_kernel(T const *__restrict__ input, } } -void forward_kernel(cudaStream_t stream, - TopKPerDeviceState const &m, - float const *input_ptr, - float *output_ptr, - int *indices_ptr, - size_t batch_size, - int length, - int k, - bool sorted) { +void gpu_forward_kernel(cudaStream_t stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { // Adopted from TensorFlow's TopK implementation // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/topk_op_gpu.h int num_shards = 0; @@ -421,14 +415,13 @@ __global__ void topk_backward_kernel(T const *__restrict__ value_grad_ptr, } } -void backward_kernel(cudaStream_t stream, - TopKPerDeviceState const &m, - float const *value_grad_ptr, - int const *indices_ptr, - float *in_grad_ptr, - size_t batch_size, - int length, - int k) { +void gpu_backward_kernel(cudaStream_t stream, + float const *value_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { topk_backward_kernel<< return legion_ordered_perm; } -void forward_kernel(cudaStream_t stream, - TransposeAttrs const &m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void gpu_forward_kernel(cudaStream_t stream, + TransposeAttrs const &m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { TransposeStrides info; - info.num_dim = input.shape.num_dims().unwrap_nonnegative(); + info.num_dim = get_num_dims(input.shape.dims).unwrap_nonnegative(); 
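// Note (assumed semantics, inferred from the surrounding code): the
// TransposeStrides fields below are filled in Legion (innermost-first)
// dimension order, with each stride built up as a running product of the
// dimension sizes, and m.perm is first converted to a LegionOrdered
// permutation (legion_ordered_perm) before it is used to index dimensions.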
assert(info.num_dim == m.perm.size()); LegionOrdered legion_ordered_perm = @@ -76,10 +76,12 @@ void forward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = input.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); - int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); + int in_dim_size = + dim_at_idx(input.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); + int out_dim_size = + dim_at_idx(output.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -88,23 +90,23 @@ void forward_kernel(cudaStream_t stream, .value.unwrap_nonnegative(); } transpose_simple_kernel<<< - GET_BLOCKS(output.shape.num_elements().int_from_positive_int()), + GET_BLOCKS(get_num_elements(output.shape.dims).int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(output.shape.num_elements().int_from_positive_int(), + stream>>>(get_num_elements(output.shape.dims).int_from_positive_int(), input.get_float_ptr(), output.get_float_ptr(), info, - 0.0f /*beta*/); + /*beta=*/0.0f); } -void backward_kernel(cudaStream_t stream, - TransposeAttrs const &m, - GenericTensorAccessorR const &out_grad, - GenericTensorAccessorW const &in_grad) { +void gpu_backward_kernel(cudaStream_t stream, + TransposeAttrs const &m, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { TransposeStrides info; - info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); + info.num_dim = get_num_dims(in_grad.shape.dims).unwrap_nonnegative(); assert(info.num_dim == m.perm.size()); LegionOrdered legion_ordered_perm = @@ -115,10 +117,12 @@ void backward_kernel(cudaStream_t stream, info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); - int out_dim_size = in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) - .int_from_positive_int(); + int in_dim_size = + dim_at_idx(out_grad.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); + int out_dim_size = + dim_at_idx(in_grad.shape.dims, legion_dim_t{nonnegative_int{i}}) + .int_from_positive_int(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } @@ -126,14 +130,14 @@ void backward_kernel(cudaStream_t stream, .value.unwrap_nonnegative()] = i; } transpose_simple_kernel<<< - GET_BLOCKS(in_grad.shape.num_elements().int_from_positive_int()), + GET_BLOCKS(get_num_elements(in_grad.shape.dims).int_from_positive_int()), CUDA_NUM_THREADS, 0, - stream>>>(in_grad.shape.num_elements().int_from_positive_int(), + stream>>>(get_num_elements(in_grad.shape.dims).int_from_positive_int(), out_grad.get_float_ptr(), in_grad.get_float_ptr(), info, - 1.0f /*beta*/); + /*beta=*/1.0f); } } // namespace Transpose diff --git a/lib/kernels/src/cuda/optimizer_kernels.cu b/lib/kernels/src/cuda/optimizer_kernels.cu index 2fce3c5db9..f457ec762e 100644 --- a/lib/kernels/src/cuda/optimizer_kernels.cu +++ b/lib/kernels/src/cuda/optimizer_kernels.cu @@ -15,7 +15,7 @@ #include "internal/device.h" #include "kernels/nccl.h" -#include "kernels/optimizer_kernels.h" +#include "kernels/optimizer_kernels_gpu.h" #include "utils/exception.h" namespace FlexFlow { @@ -43,7 +43,7 @@ __global__ void sgd_update(size_t count, } } -__host__ void 
sgd_ps_update_task_gpu(ffStream_t stream, +__host__ void gpu_sgd_ps_update_task(ffStream_t stream, float lr, float momentum, bool nesterov, @@ -72,8 +72,7 @@ __host__ void sgd_ps_update_task_gpu(ffStream_t stream, weight_ptr); } -#ifdef FF_USE_NCCL -__host__ void sgd_nccl_update_task_gpu(ffStream_t stream, +__host__ void gpu_sgd_nccl_update_task(ffStream_t stream, float lr, float momentum, bool nesterov, @@ -92,7 +91,6 @@ __host__ void sgd_nccl_update_task_gpu(ffStream_t stream, sgd_update<<>>( size, lr, weight_decay, momentum, nesterov, w_grad_ptr, v_ptr, w_ptr); } -#endif // ================================================================== // Adam Optimizer @@ -134,7 +132,7 @@ __global__ void adam_update(int count, } } -__host__ void adam_ps_update_task_gpu(ffStream_t stream, +__host__ void gpu_adam_ps_update_task(ffStream_t stream, float alpha_t, float beta1, float beta2, @@ -166,8 +164,7 @@ __host__ void adam_ps_update_task_gpu(ffStream_t stream, w_ptr); } -#ifdef FF_USE_NCCL -__host__ void adam_nccl_update_task_gpu(ffStream_t stream, +__host__ void gpu_adam_nccl_update_task(ffStream_t stream, float alpha_t, float beta1, float beta2, @@ -200,6 +197,5 @@ __host__ void adam_nccl_update_task_gpu(ffStream_t stream, v_ptr, w_ptr); } -#endif } // namespace FlexFlow diff --git a/lib/kernels/src/ff_handle.cc b/lib/kernels/src/ff_handle.cc index 63ca6975fd..0ae8fdf81d 100644 --- a/lib/kernels/src/ff_handle.cc +++ b/lib/kernels/src/ff_handle.cc @@ -1,4 +1,5 @@ #include "kernels/ff_handle.h" +#include namespace FlexFlow { diff --git a/lib/kernels/src/internal/device.h b/lib/kernels/src/internal/device.h index 226c7ad174..2e0495ed33 100644 --- a/lib/kernels/src/internal/device.h +++ b/lib/kernels/src/internal/device.h @@ -1,10 +1,11 @@ #ifndef _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H #define _FLEXFLOW_LIB_KERNELS_INCLUDE_INTERNAL_DEVICE_H -#include "kernels/array_shape.h" #include "kernels/device.h" +#include "op-attrs/activation.dtg.h" #include "op-attrs/datatype.h" #include "op-attrs/operator_type.h" +#include "op-attrs/tensor_shape.dtg.h" #include namespace FlexFlow { @@ -131,8 +132,9 @@ __host__ void updateGAS(float *para_ptr, template void print_tensor(T const *ptr, size_t num_elements, char const *prefix); -ffStatus_t cudnnSetTensorDescriptorFromArrayShape(ffTensorDescriptor_t tensor, - ArrayShape const &shape); +ffStatus_t + cudnnSetTensorDescriptorFromTensorShape(ffTensorDescriptor_t tensor, + TensorShape const &tensor_shape); ffDataType_t ff_to_cuda_datatype(DataType type); diff --git a/lib/kernels/src/kernels/accessor.cc b/lib/kernels/src/kernels/accessor.cc index 5a1881eb66..868940bf6c 100644 --- a/lib/kernels/src/kernels/accessor.cc +++ b/lib/kernels/src/kernels/accessor.cc @@ -1,6 +1,9 @@ #include "kernels/accessor.h" #include "kernels/allocation.h" #include "kernels/datatype_dispatch.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "op-attrs/tensor_dims_coord.h" +#include "op-attrs/tensor_shape.h" #include "utils/containers/reversed.h" #include "utils/containers/vector_of.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -8,33 +11,42 @@ namespace FlexFlow { -nonnegative_int - calculate_accessor_offset(LegionOrdered const &indices, - ArrayShape const &shape) { - ASSERT(indices.size() == shape.num_dims(), +nonnegative_int calculate_accessor_offset(TensorDimsCoord const &coord, + TensorDims const &tensor_dims) { + ASSERT(tensor_dims_coord_get_num_dims(coord) == get_num_dims(tensor_dims), "Number of indices does not match the number of dimensions"); 
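// Note: this is the standard row-major linearization. Iterating from the
// innermost dimension outward, each dimension's multiplier is the product of
// all faster-varying dimension sizes; e.g. for dims [4, 3], a coord (i, j)
// maps to offset = j * 1 + i * 3.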
nonnegative_int offset = 0_n; positive_int multiplier = 1_p; - for (legion_dim_t dim : reversed(vector_of(key_range(shape.dims)))) { - ASSERT(indices.at(dim) < shape.at(legion_dim_t{dim}), + for (ff_dim_t dim : reversed(get_idxs(tensor_dims.ff_ordered))) { + ASSERT(coord.ff_ordered.at(dim) < dim_at_idx(tensor_dims, dim), "Out of bounds access", dim); - offset += indices.at(dim) * multiplier; - multiplier *= shape.at(legion_dim_t{dim}); + offset += coord.ff_ordered.at(dim) * multiplier; + multiplier *= tensor_dims.ff_ordered.at(dim); } return offset; } +TensorShape + get_tensor_shape_for_accessor_r(GenericTensorAccessorR const &accessor) { + return accessor.shape; +} + +TensorShape + get_tensor_shape_for_accessor_w(GenericTensorAccessorW const &accessor) { + return accessor.shape; +} + void copy_accessor_data_to_l_from_r( - GenericTensorAccessorW &dst_accessor, + GenericTensorAccessorW const &dst_accessor, GenericTensorAccessorR const &src_accessor) { - size_t num_bytes = - dst_accessor.shape.num_elements().int_from_positive_int() * - size_of_datatype(dst_accessor.data_type).int_from_positive_int(); + size_t num_bytes = get_size_in_bytes(dst_accessor.shape) + .unwrap_num_bytes() + .unwrap_nonnegative(); DeviceType dst_device_type = dst_accessor.device_type; DeviceType src_device_type = src_accessor.device_type; @@ -65,18 +77,14 @@ GenericTensorAccessorW::operator GenericTensorAccessorR() const { } GenericTensorAccessorW::GenericTensorAccessorW( - DataType data_type, - ArrayShape const &shape, + TensorShape const &shape, void *ptr, DeviceType device_type = DeviceType::GPU) - : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + : shape(shape), ptr(ptr), device_type(device_type) {} -std::tuple<DataType const &, ArrayShape const &, void *const &, DeviceType const &> +std::tuple<TensorShape const &, void *const &, DeviceType const &> GenericTensorAccessorW::tie() const { - return std::tie(this->data_type, this->shape, this->ptr, this->device_type); + return std::tie(this->shape, this->ptr, this->device_type); } bool GenericTensorAccessorW::operator==( @@ -110,10 +118,10 @@ half *GenericTensorAccessorW::get_half_ptr() const { } std::string format_as(GenericTensorAccessorW const &a) { - return fmt::format("<GenericTensorAccessorW data_type={} shape={} ptr={}>", - a.data_type, + return fmt::format("<GenericTensorAccessorW shape={} ptr={} device_type={}>", a.shape, - a.ptr); + a.ptr, + a.device_type); } std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { @@ -121,18 +129,14 @@ std::ostream &operator<<(std::ostream &s, GenericTensorAccessorW const &a) { } GenericTensorAccessorR::GenericTensorAccessorR( - DataType data_type, - ArrayShape const &shape, + TensorShape const &shape, void const *ptr, DeviceType device_type = DeviceType::GPU) - : data_type(data_type), shape(shape), ptr(ptr), device_type(device_type) {} + : shape(shape), ptr(ptr), device_type(device_type) {} -std::tuple<DataType const &, ArrayShape const &, void const *const &, DeviceType const &> +std::tuple<TensorShape const &, void const *const &, DeviceType const &> GenericTensorAccessorR::tie() const { - return std::tie(this->data_type, this->shape, this->ptr, this->device_type); + return std::tie(this->shape, this->ptr, this->device_type); } bool GenericTensorAccessorR::operator==( @@ -166,10 +170,10 @@ half const *GenericTensorAccessorR::get_half_ptr() const { } std::string format_as(GenericTensorAccessorR const &a) { - return fmt::format("<GenericTensorAccessorR data_type={} shape={} ptr={}>", - a.data_type, + return fmt::format("<GenericTensorAccessorR shape={} ptr={} device_type={}>", a.shape, - a.ptr); + a.ptr, + a.device_type); } std::ostream &operator<<(std::ostream &s, GenericTensorAccessorR const &a) { @@ -269,45 +273,20 @@ std::vector GenericTensorAccessorR read_only_accessor_from_write_accessor( GenericTensorAccessorW const &writable) { return GenericTensorAccessorR{ - writable.data_type, writable.shape, writable.ptr, writable.device_type, }; } -bool 
is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1, - GenericTensorAccessorR const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1, - GenericTensorAccessorW const &acc2) { - return acc1.shape == acc2.shape && acc1.data_type == acc2.data_type; -} - -bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor, - ArrayShape const &expected_shape, - DataType const &expected_dtype) { - return accessor.shape == expected_shape && - accessor.data_type == expected_dtype; -} - -std::pair - get_shape_and_datatype(GenericTensorAccessorR const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); +bool accessors_have_same_shape(GenericTensorAccessorR const &acc1, + GenericTensorAccessorR const &acc2) { + return acc1.shape == acc2.shape; } -std::pair - get_shape_and_datatype(GenericTensorAccessorW const &accessor) { - return std::make_pair(accessor.shape, accessor.data_type); +bool accessors_have_same_shape(GenericTensorAccessorW const &acc1, + GenericTensorAccessorW const &acc2) { + return acc1.shape == acc2.shape; } template int32_t diff --git a/lib/kernels/src/kernels/allocation.cc b/lib/kernels/src/kernels/allocation.cc index a6881d240a..93b86f1b6d 100644 --- a/lib/kernels/src/kernels/allocation.cc +++ b/lib/kernels/src/kernels/allocation.cc @@ -17,11 +17,10 @@ DeviceType Allocator::get_allocation_device_type() const { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = - this->allocate(get_size_in_bytes(tensor_shape).int_from_positive_int()); + void *ptr = this->allocate( + get_size_in_bytes(tensor_shape).unwrap_num_bytes().unwrap_nonnegative()); return GenericTensorAccessorW{ - tensor_shape.data_type, - array_shape_from_tensor_shape(tensor_shape), + tensor_shape, ptr, this->get_allocation_device_type(), }; diff --git a/lib/kernels/src/kernels/array_shape.cc b/lib/kernels/src/kernels/array_shape.cc deleted file mode 100644 index a1fb9bf09b..0000000000 --- a/lib/kernels/src/kernels/array_shape.cc +++ /dev/null @@ -1,150 +0,0 @@ -#include "kernels/array_shape.h" -#include "kernels/legion_ordered/slice.h" -#include "op-attrs/ff_ordered/ff_ordered_of.h" -#include "op-attrs/ff_ordered/get_idxs.h" -#include "op-attrs/ff_ordered/slice.h" -#include "utils/containers/cartesian_product.h" -#include "utils/containers/product.h" -#include "utils/containers/reversed.h" -#include "utils/containers/transform.h" -#include "utils/containers/unordered_set_of.h" -#include "utils/containers/vector_of.h" -#include "utils/hash/tuple.h" -#include "utils/hash/vector.h" -#include "utils/nonnegative_int/nonnegative_range.h" -#include "utils/nonnegative_int/num_elements.h" - -namespace FlexFlow { - -ArrayShape::ArrayShape(LegionOrdered const &input_dims) - : dims(input_dims) {} - -nonnegative_int ArrayShape::num_dims() const { - return ::FlexFlow::num_elements(this->dims); -} - -positive_int ArrayShape::num_elements() const { - return product(this->dims); -} - -positive_int ArrayShape::operator[](legion_dim_t idx) const { - return dims.at(idx); -} - -positive_int ArrayShape::at(legion_dim_t idx) const { - return dims.at(idx); -} - -positive_int ArrayShape::at(ff_dim_t idx) const { - return 
dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); -} - -bool ArrayShape::operator==(ArrayShape const &other) const { - return this->tie() == other.tie(); -} - -bool ArrayShape::operator!=(ArrayShape const &other) const { - return this->tie() != other.tie(); -} - -ArrayShape - ArrayShape::sub_shape(ff_dim_t const &start, - std::optional const &maybe_end) const { - FFOrdered ff_ordered_dims = - ff_ordered_from_legion_ordered(this->dims); - FFOrdered sliced = slice(ff_ordered_dims, start, maybe_end); - return ArrayShape{legion_ordered_from_ff_ordered(sliced)}; -} - -ArrayShape - ArrayShape::sub_shape(legion_dim_t const &start, - std::optional const &maybe_end) const { - return ArrayShape{slice(this->dims, start, maybe_end)}; -} - -std::optional ArrayShape::at_maybe(legion_dim_t index) const { - if (index.value < dims.size()) { - return dims.at(index); - } else { - return std::nullopt; - } -} - -std::optional ArrayShape::at_maybe(ff_dim_t index) const { - return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); -} - -std::tuple const &> ArrayShape::tie() const { - return std::tie(this->dims); -} - -std::string format_as(ArrayShape const &x) { - std::ostringstream oss; - oss << ""; - return oss.str(); -} - -std::ostream &operator<<(std::ostream &s, ArrayShape const &x) { - return (s << fmt::to_string(x)); -} - -positive_int get_num_elements(ArrayShape const &shape) { - return shape.num_elements(); -} - -ArrayShape array_shape_from_tensor_shape(TensorShape const &tensor_shape) { - return ArrayShape{ - legion_ordered_from_ff_ordered(tensor_shape.dims.ff_ordered)}; -} - -TensorShape get_tensor_shape(ArrayShape const &shape, DataType dtype) { - return TensorShape{TensorDims{ff_ordered_from_legion_ordered(shape.dims)}, - dtype}; -} - -std::unordered_set get_ff_dim_t_set(ArrayShape const &shape) { - return unordered_set_of(get_idxs(ff_ordered_from_legion_ordered(shape.dims))); -} - -std::unordered_set get_array_coord_set(ArrayShape const &shape) { - std::vector> per_dim_ranges = transform( - vector_of(ff_ordered_from_legion_ordered(shape.dims)), - [](positive_int dim_size) -> std::vector { - return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); - }); - - std::unordered_set> raw_points = - unordered_set_of(cartesian_product(per_dim_ranges)); - - return transform(raw_points, - [](std::vector const &raw_point) { - return ArrayCoord{ff_ordered_of(raw_point)}; - }); -} - -ArrayShape array_shape_drop_dims( - ArrayShape const &shape, - std::function const &should_drop_dim) { - std::vector result; - for (ff_dim_t idx : get_idxs(ff_ordered_from_legion_ordered(shape.dims))) { - if (!should_drop_dim(idx)) { - result.push_back(shape.at(idx)); - } - } - - return ArrayShape{legion_ordered_from_ff_ordered(ff_ordered_of(result))}; -} - -} // namespace FlexFlow - -namespace std { - -using namespace FlexFlow; - -size_t hash::operator()(ArrayShape const &s) const { - return get_std_hash(s.tie()); -} - -} // namespace std diff --git a/lib/kernels/src/kernels/attention_kernels.cc b/lib/kernels/src/kernels/attention_kernels.cc new file mode 100644 index 0000000000..f3b024d7c9 --- /dev/null +++ b/lib/kernels/src/kernels/attention_kernels.cc @@ -0,0 +1,125 @@ +#include "kernels/attention_kernels.h" +#include "kernels/attention_kernels_cpu.h" +#include "kernels/attention_kernels_gpu.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &per_device_ff_handle, + Allocator &allocator, + int num_samples, + int 
num_heads, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoSeqLength, + int kvSeqLength, + bool add_bias_kv) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*per_device_ff_handle=*/per_device_ff_handle.require_for_gpu(), + /*allocator=*/allocator, + /*num_samples=*/num_samples, + /*num_heads=*/num_heads, + /*qSize=*/qSize, + /*kSize=*/kSize, + /*vSize=*/vSize, + /*qProjSize=*/qProjSize, + /*kProjSize=*/kProjSize, + /*vProjSize=*/vProjSize, + /*oProjSize=*/oProjSize, + /*qoSeqLength=*/qoSeqLength, + /*kvSeqLength=*/kvSeqLength, + /*add_bias_kv=*/add_bias_kv); + } else { + ASSERT(per_device_ff_handle.is_for_cpu()); + ASSERT(device_type == DeviceType::CPU); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + std::optional const &device_state, + float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*device_state=*/device_state.value(), + /*query_ptr=*/query_ptr, + /*key_ptr=*/key_ptr, + /*value_ptr=*/value_ptr, + /*weight_ptr=*/weight_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(device_state == std::nullopt); + cpu_forward_kernel( + /*query_ptr=*/query_ptr, + /*key_ptr=*/key_ptr, + /*value_ptr=*/value_ptr, + /*weight_ptr=*/weight_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + std::optional const &device_state, + float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*device_state=*/device_state.value(), + /*query_ptr=*/query_ptr, + /*query_grad_ptr=*/query_grad_ptr, + /*key_ptr=*/key_ptr, + /*key_grad_ptr=*/key_grad_ptr, + /*value_ptr=*/value_ptr, + /*value_grad_ptr=*/value_grad_ptr, + /*weight_ptr=*/weight_ptr, + /*weight_grad_ptr=*/weight_grad_ptr, + /*output_grad_ptr=*/output_grad_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(device_state == std::nullopt); + cpu_backward_kernel( + /*query_ptr=*/query_ptr, + /*query_grad_ptr=*/query_grad_ptr, + /*key_ptr=*/key_ptr, + /*key_grad_ptr=*/key_grad_ptr, + /*value_ptr=*/value_ptr, + /*value_grad_ptr=*/value_grad_ptr, + /*weight_ptr=*/weight_ptr, + /*weight_grad_ptr=*/weight_grad_ptr, + /*output_grad_ptr=*/output_grad_ptr); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional const &device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/kernels/attention_kernels_cpu.cc b/lib/kernels/src/kernels/attention_kernels_cpu.cc new file mode 100644 index 0000000000..5af254fa5e --- /dev/null +++ b/lib/kernels/src/kernels/attention_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/attention_kernels_cpu.h" + +namespace FlexFlow::Kernels::MultiHeadAttention { + +void cpu_forward_kernel(float const *query_ptr, + float const *key_ptr, + float const *value_ptr, + float const *weight_ptr, + float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void 
cpu_backward_kernel(float const *query_ptr, + float *query_grad_ptr, + float const *key_ptr, + float *key_grad_ptr, + float const *value_ptr, + float *value_grad_ptr, + float const *weight_ptr, + float *weight_grad_ptr, + float const *output_grad_ptr) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::MultiHeadAttention diff --git a/lib/kernels/src/kernels/batch_matmul_kernels.cc b/lib/kernels/src/kernels/batch_matmul_kernels.cc new file mode 100644 index 0000000000..652d4fb137 --- /dev/null +++ b/lib/kernels/src/kernels/batch_matmul_kernels.cc @@ -0,0 +1,93 @@ +#include "kernels/batch_matmul_kernels.h" +#include "kernels/batch_matmul_kernels_cpu.h" +#include "kernels/batch_matmul_kernels_gpu.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void forward_kernel(device_stream_t const &stream, + device_handle_t const &handle, + float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*handle=*/handle.require_for_gpu(), + /*output_ptr=*/output_ptr, + /*a_input_ptr=*/a_input_ptr, + /*b_input_ptr=*/b_input_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch, + /*seq_length=*/seq_length, + /*a_seq_length_dim=*/a_seq_length_dim, + /*b_seq_length_dim=*/b_seq_length_dim); + } else { + ASSERT(stream.is_cpu()); + ASSERT(handle.is_for_cpu()); + cpu_forward_kernel( + /*output_ptr=*/output_ptr, + /*a_input_ptr=*/a_input_ptr, + /*b_input_ptr=*/b_input_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch, + /*seq_length=*/seq_length, + /*a_seq_length_dim=*/a_seq_length_dim, + /*b_seq_length_dim=*/b_seq_length_dim); + } +} + +void backward_kernel(device_stream_t const &stream, + device_handle_t const &handle, + float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*handle=*/handle.require_for_gpu(), + /*o_ptr=*/o_ptr, + /*o_grad_ptr=*/o_grad_ptr, + /*a_ptr=*/a_ptr, + /*a_grad_ptr=*/a_grad_ptr, + /*b_ptr=*/b_ptr, + /*b_grad_ptr=*/b_grad_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch); + } else { + ASSERT(stream.is_cpu()); + ASSERT(handle.is_for_cpu()); + cpu_backward_kernel( + /*o_ptr=*/o_ptr, + /*o_grad_ptr=*/o_grad_ptr, + /*a_ptr=*/a_ptr, + /*a_grad_ptr=*/a_grad_ptr, + /*b_ptr=*/b_ptr, + /*b_grad_ptr=*/b_grad_ptr, + /*m=*/m, + /*n=*/n, + /*k=*/k, + /*batch=*/batch); + } +} + +} // namespace FlexFlow::Kernels::BatchMatmul diff --git a/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc b/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc new file mode 100644 index 0000000000..f139d42992 --- /dev/null +++ b/lib/kernels/src/kernels/batch_matmul_kernels_cpu.cc @@ -0,0 +1,31 @@ +#include "kernels/batch_matmul_kernels_cpu.h" + +namespace FlexFlow::Kernels::BatchMatmul { + +void cpu_forward_kernel(float *output_ptr, + float const *a_input_ptr, + float const *b_input_ptr, + int m, + int n, + int k, + int batch, + int seq_length, + int a_seq_length_dim, + int b_seq_length_dim) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *o_ptr, + float const *o_grad_ptr, + float const *a_ptr, + float *a_grad_ptr, + float const *b_ptr, + float *b_grad_ptr, + int m, + int n, + int k, + int batch) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::BatchMatmul 
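The CPU paths above are declared but stubbed out with NOT_IMPLEMENTED(). As a point of reference, here is a minimal sketch of what cpu_forward_kernel could compute, assuming the same cuBLAS-style column-major m/n/k convention as the GPU path; the seq_length / a_seq_length_dim / b_seq_length_dim arguments are deliberately ignored, and the name cpu_forward_kernel_sketch is hypothetical, so this is illustrative only, not the behavior shipped in this patch.

// Naive batched C = A * B, column-major: A is m x k, B is k x n, C is m x n,
// with each batch's operands laid out contiguously one after another.
void cpu_forward_kernel_sketch(float *output_ptr,
                               float const *a_input_ptr,
                               float const *b_input_ptr,
                               int m, int n, int k, int batch) {
  for (int bi = 0; bi < batch; bi++) {
    float const *a = a_input_ptr + bi * m * k;
    float const *b = b_input_ptr + bi * k * n;
    float *c = output_ptr + bi * m * n;
    for (int col = 0; col < n; col++) {
      for (int row = 0; row < m; row++) {
        float acc = 0.0f;
        for (int i = 0; i < k; i++) {
          acc += a[i * m + row] * b[col * k + i]; // A(row, i) * B(i, col)
        }
        c[col * m + row] = acc; // C(row, col)
      }
    }
  }
}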
diff --git a/lib/kernels/src/kernels/batch_norm_kernels.cc b/lib/kernels/src/kernels/batch_norm_kernels.cc new file mode 100644 index 0000000000..e23f6a89e2 --- /dev/null +++ b/lib/kernels/src/kernels/batch_norm_kernels.cc @@ -0,0 +1,107 @@ +#include "kernels/batch_norm_kernels.h" +#include "kernels/batch_norm_kernels_cpu.h" +#include "kernels/batch_norm_kernels_gpu.h" + +namespace FlexFlow::Kernels::BatchNorm { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + Allocator &allocator, + float *runningMean, + int output_n, + int output_c, + int output_h, + int output_w, + bool relu) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*allocator=*/allocator, + /*runningMean=*/runningMean, + /*output_n=*/output_n, + /*output_c=*/output_c, + /*output_h=*/output_h, + /*output_w=*/output_w, + /*relu=*/relu); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *scale_ptr, + float const *bias_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state, + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*scale_ptr=*/scale_ptr, + /*bias_ptr=*/bias_ptr); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*per_device_state=*/per_device_state, + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*scale_ptr=*/scale_ptr, + /*bias_ptr=*/bias_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state, + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*scale_ptr=*/scale_ptr, + /*scale_grad_ptr=*/scale_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*numElements=*/numElements); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*per_device_state=*/per_device_state, + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*scale_ptr=*/scale_ptr, + /*scale_grad_ptr=*/scale_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*numElements=*/numElements); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::BatchNorm diff --git a/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc b/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc new file mode 100644 index 0000000000..be440ac4ea --- /dev/null +++ b/lib/kernels/src/kernels/batch_norm_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/batch_norm_kernels_cpu.h" + +namespace FlexFlow::Kernels::BatchNorm { + +void cpu_forward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *input_ptr, + float 
*output_ptr, + float const *scale_ptr, + float const *bias_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(BatchNormPerDeviceState const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *scale_ptr, + float *scale_grad_ptr, + float *bias_grad_ptr, + size_t numElements) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::BatchNorm diff --git a/lib/kernels/src/kernels/cast_kernels.cc b/lib/kernels/src/kernels/cast_kernels.cc new file mode 100644 index 0000000000..2c668c42b2 --- /dev/null +++ b/lib/kernels/src/kernels/cast_kernels.cc @@ -0,0 +1,39 @@ +#include "kernels/cast_kernels.h" +#include "kernels/cast_kernels_cpu.h" +#include "kernels/cast_kernels_gpu.h" + +namespace FlexFlow::Kernels::Cast { + +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*input=*/input, + /*output=*/output); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*input=*/input, + /*output=*/output); + } +} + +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad=*/output_grad, + /*input_grad=*/input_grad); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad=*/output_grad, + /*input_grad=*/input_grad); + } +} + +} // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/cpu/ops/cast_kernels.cc b/lib/kernels/src/kernels/cast_kernels_cpu.cc similarity index 82% rename from lib/kernels/src/cpu/ops/cast_kernels.cc rename to lib/kernels/src/kernels/cast_kernels_cpu.cc index 08a98f165b..f943fa142b 100644 --- a/lib/kernels/src/cpu/ops/cast_kernels.cc +++ b/lib/kernels/src/kernels/cast_kernels_cpu.cc @@ -21,7 +21,7 @@ template <DataType IDT, DataType ODT> struct CPUForwardKernel { void operator()(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(input.shape.dims).int_from_positive_int(); cpu_cast_forward(input.get<IDT>(), output.get<ODT>(), volume); } }; @@ -30,7 +30,7 @@ template <DataType IDT, DataType ODT> struct CPUBackwardKernel { void operator()(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { - size_t volume = output.shape.num_elements().int_from_positive_int(); + size_t volume = get_num_elements(output.shape.dims).int_from_positive_int(); cpu_cast_backward( output.get<IDT>(), input.get<ODT>(), volume, cast_to<ODT>(1.0f)); } }; @@ -39,13 +39,13 @@ struct CPUBackwardKernel { void cpu_forward_kernel(GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { DataTypeDispatch2<CPUForwardKernel>{}( - input.data_type, output.data_type, input, output); + input.shape.data_type, output.shape.data_type, input, output); } void cpu_backward_kernel(GenericTensorAccessorR const &output, GenericTensorAccessorW const &input) { DataTypeDispatch2<CPUBackwardKernel>{}( - output.data_type, input.data_type, output, input); + output.shape.data_type, input.shape.data_type, output, input); } } // namespace FlexFlow::Kernels::Cast diff --git a/lib/kernels/src/kernels/concat_kernels.cc b/lib/kernels/src/kernels/concat_kernels.cc new file mode 100644 index 0000000000..8d0c3112aa --- /dev/null +++ b/lib/kernels/src/kernels/concat_kernels.cc @@ -0,0 +1,45 @@ +#include 
"kernels/concat_kernels.h" +#include "kernels/concat_kernels_cpu.h" +#include "kernels/concat_kernels_gpu.h" + +namespace FlexFlow::Kernels::Concat { + +void forward_kernel(device_stream_t const &stream, + GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*output=*/output, + /*inputs=*/inputs, + /*axis=*/axis); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*output=*/output, + /*inputs=*/inputs, + /*axis=*/axis); + } +} + +void backward_kernel(device_stream_t const &stream, + GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad=*/output_grad, + /*input_grads=*/input_grads, + /*axis=*/axis); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad=*/output_grad, + /*input_grads=*/input_grads, + /*axis=*/axis); + } +} + +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/kernels/concat_kernels_cpu.cc b/lib/kernels/src/kernels/concat_kernels_cpu.cc new file mode 100644 index 0000000000..03bbff67bb --- /dev/null +++ b/lib/kernels/src/kernels/concat_kernels_cpu.cc @@ -0,0 +1,17 @@ +#include "kernels/concat_kernels_cpu.h" + +namespace FlexFlow::Kernels::Concat { + +void cpu_forward_kernel(GenericTensorAccessorW const &output, + std::vector const &inputs, + ff_dim_t axis) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(GenericTensorAccessorR const &output_grad, + std::vector const &input_grads, + ff_dim_t axis) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Concat diff --git a/lib/kernels/src/kernels/conv_2d_kernels.cc b/lib/kernels/src/kernels/conv_2d_kernels.cc new file mode 100644 index 0000000000..3008e7d1c0 --- /dev/null +++ b/lib/kernels/src/kernels/conv_2d_kernels.cc @@ -0,0 +1,118 @@ +#include "kernels/conv_2d_kernels.h" +#include "kernels/conv_2d_kernels_cpu.h" +#include "kernels/conv_2d_kernels_gpu.h" + +namespace FlexFlow::Kernels::Conv2D { + +std::optional + init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + int kernel_h, + int kernel_w, + int groups, + int padding_h, + int padding_w, + int stride_h, + int stride_w, + GenericTensorAccessorW const &input, + GenericTensorAccessorW const &output, + float const *filter_ptr, + float *filter_grad_ptr) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*activation=*/activation, + /*kernel_h=*/kernel_h, + /*kernel_w=*/kernel_w, + /*groups=*/groups, + /*padding_h=*/padding_h, + /*padding_w=*/padding_w, + /*stride_h=*/stride_h, + /*stride_w=*/stride_w, + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel(device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional activation) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*filter_ptr=*/filter_ptr, + /*bias_ptr=*/bias_ptr, + /*activation=*/activation); + } else { + ASSERT(stream.is_cpu()); + 
cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*filter_ptr=*/filter_ptr, + /*bias_ptr=*/bias_ptr, + /*activation=*/activation); + } +} + +void backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional activation) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*activation=*/activation); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_ptr=*/output_ptr, + /*output_grad_ptr=*/output_grad_ptr, + /*input_ptr=*/input_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*filter_ptr=*/filter_ptr, + /*filter_grad_ptr=*/filter_grad_ptr, + /*bias_grad_ptr=*/bias_grad_ptr, + /*activation=*/activation); + } +} + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Conv2D diff --git a/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc b/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc new file mode 100644 index 0000000000..c595ecb586 --- /dev/null +++ b/lib/kernels/src/kernels/conv_2d_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/conv_2d_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Conv2D { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + float const *filter_ptr, + float const *bias_ptr, + std::optional const &activation) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_ptr, + float *output_grad_ptr, + float const *input_ptr, + float *input_grad_ptr, + float const *filter_ptr, + float *filter_grad_ptr, + float *bias_grad_ptr, + std::optional const &activation) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Conv2D diff --git a/lib/kernels/src/kernels/copy_tensor_accessor.cc b/lib/kernels/src/kernels/copy_tensor_accessor.cc index d8619d8ce6..2989a3d227 100644 --- a/lib/kernels/src/kernels/copy_tensor_accessor.cc +++ b/lib/kernels/src/kernels/copy_tensor_accessor.cc @@ -7,9 +7,8 @@ template struct CopyTensorAccessorW { GenericTensorAccessorW operator()(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + GenericTensorAccessorW dst_accessor = + allocator.allocate_tensor(src_accessor.shape); copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); @@ -21,28 +20,27 @@ GenericTensorAccessorW copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); + src_accessor.shape.data_type, src_accessor, allocator); } template struct CopyTensorAccessorR { - GenericTensorAccessorR operator()(GenericTensorAccessorR const &src_accessor, + GenericTensorAccessorW operator()(GenericTensorAccessorR 
const &src_accessor, Allocator &allocator) { - TensorShape shape = - get_tensor_shape(src_accessor.shape, src_accessor.data_type); - GenericTensorAccessorW dst_accessor = allocator.allocate_tensor(shape); + GenericTensorAccessorW dst_accessor = + allocator.allocate_tensor(src_accessor.shape); copy_accessor_data_to_l_from_r(dst_accessor, src_accessor); - return read_only_accessor_from_write_accessor(dst_accessor); + return dst_accessor; } }; -GenericTensorAccessorR +GenericTensorAccessorW copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor, Allocator &allocator) { return DataTypeDispatch1{}( - src_accessor.data_type, src_accessor, allocator); + src_accessor.shape.data_type, src_accessor, allocator); } GenericTensorAccessorR copy_tensor_accessor_r_to_cpu_if_necessary( diff --git a/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc b/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc new file mode 100644 index 0000000000..ea5f054d1b --- /dev/null +++ b/lib/kernels/src/kernels/create_local_allocator_for_device_type.cc @@ -0,0 +1,16 @@ +#include "kernels/create_local_allocator_for_device_type.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" + +namespace FlexFlow { + +Allocator create_local_allocator_for_device_type(DeviceType device_type) { + if (device_type == DeviceType::GPU) { + return create_local_cuda_memory_allocator(); + } else { + ASSERT(device_type == DeviceType::CPU); + return create_local_cpu_memory_allocator(); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/device_handle_t.cc b/lib/kernels/src/kernels/device_handle_t.cc new file mode 100644 index 0000000000..85f9e2a388 --- /dev/null +++ b/lib/kernels/src/kernels/device_handle_t.cc @@ -0,0 +1,24 @@ +#include "kernels/device_handle_t.h" + +namespace FlexFlow { + +device_handle_t device_handle_t_from_managed_handle( + std::optional const &managed_handle) { + if (managed_handle.has_value()) { + return gpu_make_device_handle_t(managed_handle.value().raw_handle()); + } else { + return cpu_make_device_handle_t(); + } +} + +device_handle_t gpu_make_device_handle_t(PerDeviceFFHandle const &ff_handle) { + return device_handle_t{ + ff_handle, + }; +} + +device_handle_t cpu_make_device_handle_t() { + return device_handle_t{std::monostate{}}; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/device_stream_t.cc b/lib/kernels/src/kernels/device_stream_t.cc new file mode 100644 index 0000000000..8efa54d8bd --- /dev/null +++ b/lib/kernels/src/kernels/device_stream_t.cc @@ -0,0 +1,25 @@ +#include "kernels/device_stream_t.h" +#include + +namespace FlexFlow { + +device_stream_t get_gpu_device_stream() { + ffStream_t stream; + checkCUDA(get_legion_stream(&stream)); + return device_stream_t{stream}; +} + +device_stream_t get_cpu_device_stream() { + return device_stream_t{std::monostate{}}; +} + +device_stream_t get_stream_for_device_type(DeviceType device_type) { + if (device_type == DeviceType::GPU) { + return get_gpu_device_stream(); + } else { + ASSERT(device_type == DeviceType::CPU); + return get_cpu_device_stream(); + } +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/dropout_kernels.cc b/lib/kernels/src/kernels/dropout_kernels.cc new file mode 100644 index 0000000000..ae1f3d4c0e --- /dev/null +++ b/lib/kernels/src/kernels/dropout_kernels.cc @@ -0,0 +1,79 @@ +#include "kernels/dropout_kernels.h" +#include "kernels/dropout_kernels_cpu.h" +#include "kernels/dropout_kernels_gpu.h" + +namespace 
FlexFlow::Kernels::Dropout { + +std::optional<DropoutPerDeviceState> + init_kernel(DeviceType device_type, + device_handle_t const &handle, + float rate, + unsigned long long seed, + TensorShape const &output_shape, + Allocator &allocator) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*rate=*/rate, + /*seed=*/seed, + /*output_shape=*/output_shape, + /*allocator=*/allocator); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel( + device_stream_t const &stream, + std::optional<DropoutPerDeviceState> const &per_device_state, + float const *input_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel( + device_stream_t const &stream, + std::optional<DropoutPerDeviceState> const &per_device_state, + float const *output_grad_ptr, + float *input_grad_ptr) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + cpu_backward_kernel( + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr); + } +} + +void cleanup_kernel(DeviceType device_type, + Allocator &allocator, + std::optional<DropoutPerDeviceState> &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(allocator, per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Dropout diff --git a/lib/kernels/src/kernels/dropout_kernels_cpu.cc b/lib/kernels/src/kernels/dropout_kernels_cpu.cc new file mode 100644 index 0000000000..f6558af96c --- /dev/null +++ b/lib/kernels/src/kernels/dropout_kernels_cpu.cc @@ -0,0 +1,14 @@ +#include "kernels/dropout_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Dropout { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Dropout diff --git a/lib/kernels/src/kernels/element_binary_kernels.cc b/lib/kernels/src/kernels/element_binary_kernels.cc new file mode 100644 index 0000000000..bea317dfec --- /dev/null +++ b/lib/kernels/src/kernels/element_binary_kernels.cc @@ -0,0 +1,116 @@ +#include "kernels/element_binary_kernels.h" +#include "kernels/element_binary_kernels_cpu.h" +#include "kernels/element_binary_kernels_gpu.h" + +namespace FlexFlow::Kernels::ElementBinary { + +std::optional<ElementBinaryPerDeviceState> + init_kernel(DeviceType device_type, + device_handle_t const &handle, + OperatorType op_type, + bool should_broadcast_lhs, + bool should_broadcast_rhs, + TensorShape const &lhs_shape, + TensorShape const &rhs_shape, + TensorShape const &output_shape) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*op_type=*/op_type, + /*should_broadcast_lhs=*/should_broadcast_lhs, + /*should_broadcast_rhs=*/should_broadcast_rhs, + /*lhs_shape=*/lhs_shape, + /*rhs_shape=*/rhs_shape, + 
+        /*output_shape=*/output_shape);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state,
+    float const *lhs_ptr,
+    float const *rhs_ptr,
+    float *out_ptr,
+    OperatorType op_type,
+    bool broadcast_inputLHS,
+    device_handle_t const &handle) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*out_ptr=*/out_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*handle=*/handle.require_for_gpu());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_forward_kernel(
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*out_ptr=*/out_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state,
+    float const *out_grad_ptr,
+    float const *lhs_ptr,
+    float const *rhs_ptr,
+    float *lhs_grad_ptr,
+    float *rhs_grad_ptr,
+    OperatorType op_type,
+    bool broadcast_inputLHS,
+    bool broadcast_inputRHS,
+    device_handle_t const &handle) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*out_grad_ptr=*/out_grad_ptr,
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*lhs_grad_ptr=*/lhs_grad_ptr,
+        /*rhs_grad_ptr=*/rhs_grad_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*broadcast_inputRHS=*/broadcast_inputRHS,
+        /*handle=*/handle.require_for_gpu());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_backward_kernel(
+        /*out_grad_ptr=*/out_grad_ptr,
+        /*lhs_ptr=*/lhs_ptr,
+        /*rhs_ptr=*/rhs_ptr,
+        /*lhs_grad_ptr=*/lhs_grad_ptr,
+        /*rhs_grad_ptr=*/rhs_grad_ptr,
+        /*op_type=*/op_type,
+        /*broadcast_inputLHS=*/broadcast_inputLHS,
+        /*broadcast_inputRHS=*/broadcast_inputRHS);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<ElementBinaryPerDeviceState> const &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::ElementBinary
diff --git a/lib/kernels/src/kernels/element_binary_kernels_cpu.cc b/lib/kernels/src/kernels/element_binary_kernels_cpu.cc
new file mode 100644
index 0000000000..cbcd98dc7e
--- /dev/null
+++ b/lib/kernels/src/kernels/element_binary_kernels_cpu.cc
@@ -0,0 +1,25 @@
+#include "kernels/element_binary_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::ElementBinary {
+
+void cpu_forward_kernel(float const *lhs_ptr,
+                        float const *rhs_ptr,
+                        float *out_ptr,
+                        OperatorType op_type,
+                        bool broadcast_inputLHS) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(float const *out_grad_ptr,
+                         float const *lhs_ptr,
+                         float const *rhs_ptr,
+                         float *lhs_grad_ptr,
+                         float *rhs_grad_ptr,
+                         OperatorType op_type,
+                         bool broadcast_inputLHS,
+                         bool broadcast_inputRHS) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::ElementBinary
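Every wrapper in this patch follows the same dispatch recipe: branch on the stream, require per-device state on the GPU path, and require its absence on the CPU path. A distilled, self-contained sketch of that pattern (FooPerDeviceState and the gpu_/cpu_ entry points are placeholders, not real FlexFlow symbols):

#include <cassert>
#include <optional>

// Stand-ins so the sketch compiles on its own; the real types live in lib/kernels.
struct device_stream_t {
  bool gpu;
  bool is_gpu() const { return gpu; }
  bool is_cpu() const { return !gpu; }
};
struct FooPerDeviceState {};

void gpu_forward_kernel(FooPerDeviceState const &, float const *in, float *out) { *out = *in; }
void cpu_forward_kernel(float const *in, float *out) { *out = *in; }

// The shape shared by every forward_kernel above.
void forward_kernel(device_stream_t const &stream,
                    std::optional<FooPerDeviceState> const &per_device_state,
                    float const *in,
                    float *out) {
  if (stream.is_gpu()) {
    gpu_forward_kernel(per_device_state.value(), in, out);
  } else {
    assert(stream.is_cpu());
    assert(per_device_state == std::nullopt); // CPU path carries no state
    cpu_forward_kernel(in, out);
  }
}

The invariant "state present iff GPU" is established by init_kernel (which returns std::nullopt on CPU) and re-checked at every call site, so a mismatched stream/state pair fails loudly rather than silently.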
diff --git a/lib/kernels/src/kernels/element_unary_kernels.cc b/lib/kernels/src/kernels/element_unary_kernels.cc
new file mode 100644
index 0000000000..ff61385336
--- /dev/null
+++ b/lib/kernels/src/kernels/element_unary_kernels.cc
@@ -0,0 +1,92 @@
+#include "kernels/element_unary_kernels.h"
+#include "kernels/element_unary_kernels_cpu.h"
+#include "kernels/element_unary_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::ElementUnary {
+
+std::optional<ElementUnaryPerDeviceState>
+    init_kernel(DeviceType device_type,
+                TensorShape const &input_shape,
+                TensorShape const &output_shape,
+                ElementUnaryAttrs const &attrs) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*input_shape=*/input_shape,
+        /*output_shape=*/output_shape,
+        /*attrs=*/attrs);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementUnaryPerDeviceState> const &per_device_state,
+    ElementUnaryAttrs const &attrs,
+    device_handle_t const &handle,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*attrs=*/attrs,
+        /*handle=*/handle.require_for_gpu(),
+        /*input=*/input,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_forward_kernel(
+        /*attrs=*/attrs,
+        /*input=*/input,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ElementUnaryPerDeviceState> const &per_device_state,
+    ElementUnaryAttrs const &attrs,
+    device_handle_t const &handle,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*attrs=*/attrs,
+        /*handle=*/handle.require_for_gpu(),
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    ASSERT(handle.is_for_cpu());
+    cpu_backward_kernel(
+        /*attrs=*/attrs,
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<ElementUnaryPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::ElementUnary
diff --git a/lib/kernels/src/kernels/element_unary_kernels_cpu.cc b/lib/kernels/src/kernels/element_unary_kernels_cpu.cc
new file mode 100644
index 0000000000..0c2f521b96
--- /dev/null
+++ b/lib/kernels/src/kernels/element_unary_kernels_cpu.cc
@@ -0,0 +1,19 @@
+#include "kernels/element_unary_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::ElementUnary {
+
+void cpu_forward_kernel(ElementUnaryAttrs const &attrs,
+                        GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(ElementUnaryAttrs const &attrs,
+                         GenericTensorAccessorR const &output,
+                         GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::ElementUnary
diff --git a/lib/kernels/src/kernels/embedding_kernels.cc b/lib/kernels/src/kernels/embedding_kernels.cc
new file mode 100644
index 0000000000..957d297b9e
--- /dev/null
+++ b/lib/kernels/src/kernels/embedding_kernels.cc
@@ -0,0 +1,81 @@
+#include "kernels/embedding_kernels.h"
+#include "kernels/embedding_kernels_cpu.h"
+#include "kernels/embedding_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Embedding {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorW const &output,
+                    GenericTensorAccessorR const &weight,
+                    DataType input_data_type,
+                    DataType output_data_type,
+                    std::optional<AggregateOp> aggr,
+                    int in_dim,
+                    int out_dim,
+                    int batch_size) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output=*/output,
+        /*weight=*/weight,
+        /*input_data_type=*/input_data_type,
+        /*output_data_type=*/output_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output,
+        /*weight=*/weight,
+        /*input_data_type=*/input_data_type,
+        /*output_data_type=*/output_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output,
+                     GenericTensorAccessorR const &input,
+                     GenericTensorAccessorW const &weight_grad,
+                     DataType output_data_type,
+                     DataType input_data_type,
+                     std::optional<AggregateOp> aggr,
+                     int in_dim,
+                     int out_dim,
+                     int batch_size) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*output=*/output,
+        /*input=*/input,
+        /*weight_grad=*/weight_grad,
+        /*output_data_type=*/output_data_type,
+        /*input_data_type=*/input_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output=*/output,
+        /*input=*/input,
+        /*weight_grad=*/weight_grad,
+        /*output_data_type=*/output_data_type,
+        /*input_data_type=*/input_data_type,
+        /*aggr=*/aggr,
+        /*in_dim=*/in_dim,
+        /*out_dim=*/out_dim,
+        /*batch_size=*/batch_size);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Embedding
diff --git a/lib/kernels/src/kernels/embedding_kernels_cpu.cc b/lib/kernels/src/kernels/embedding_kernels_cpu.cc
new file mode 100644
index 0000000000..f5df53e322
--- /dev/null
+++ b/lib/kernels/src/kernels/embedding_kernels_cpu.cc
@@ -0,0 +1,29 @@
+#include "kernels/embedding_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Embedding {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorR const &weight,
+                        DataType input_data_type,
+                        DataType output_data_type,
+                        std::optional<AggregateOp> aggr,
+                        int in_dim,
+                        int out_dim,
+                        int batch_size) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &weight_grad,
+                         DataType output_data_type,
+                         DataType input_data_type,
+                         std::optional<AggregateOp> aggr,
+                         int in_dim,
+                         int out_dim,
+                         int batch_size) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Embedding
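The CPU embedding kernels above are left NOT_IMPLEMENTED. For intuition about what they will eventually compute, here is a self-contained sketch of the plain lookup case (aggr == std::nullopt); the flat std::vector layout and function name are illustrative assumptions, not the accessor-based FlexFlow implementation:

#include <cstdint>
#include <vector>

// Plain embedding lookup: out[i] = weight[token[i]], row by row.
std::vector<float> embedding_forward(std::vector<int32_t> const &tokens,
                                     std::vector<float> const &weight, // [vocab, out_dim], row-major
                                     int out_dim) {
  std::vector<float> out(tokens.size() * out_dim);
  for (size_t i = 0; i < tokens.size(); i++) {
    for (int d = 0; d < out_dim; d++) {
      out[i * out_dim + d] = weight[tokens[i] * out_dim + d];
    }
  }
  return out;
}

The backward pass is the scatter dual of this gather: each output gradient row is accumulated into weight_grad at the row indexed by the corresponding token.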
diff --git a/lib/kernels/src/kernels/fill_tensor_accessor.cc b/lib/kernels/src/kernels/fill_tensor_accessor.cc
index bee8d12556..7b5a01d03f 100644
--- a/lib/kernels/src/kernels/fill_tensor_accessor.cc
+++ b/lib/kernels/src/kernels/fill_tensor_accessor.cc
@@ -1,11 +1,33 @@
 #include "kernels/fill_tensor_accessor.h"
+#include "kernels/datatype_dispatch.h"
 #include "op-attrs/datatype_value.h"
+#include "op-attrs/tensor_shape.h"
 
 namespace FlexFlow {
 
-void fill_tensor_accessor(GenericTensorAccessorW &accessor, DataTypeValue val) {
-  ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.data_type == get_data_type_of_data_type_value(val));
+template <DataType DT>
+struct FillWithZeros {
+  void operator()(GenericTensorAccessorW const &accessor) {
+    using T = real_type_t<DT>;
+
+    if (accessor.device_type == DeviceType::CPU) {
+      memset(accessor.ptr,
+             0,
+             get_size_in_bytes(accessor.shape)
+                 .unwrap_num_bytes()
+                 .unwrap_nonnegative());
+    } else {
+      checkCUDA(cudaMemset(accessor.ptr,
+                           0,
+                           get_size_in_bytes(accessor.shape)
+                               .unwrap_num_bytes()
+                               .unwrap_nonnegative()));
+    }
+  }
+};
+
+void fill_with_zeros(GenericTensorAccessorW const &accessor) {
+  DataTypeDispatch1<FillWithZeros>{}(accessor.shape.data_type, accessor);
 }
 
 GenericTensorAccessorW create_accessor_w_filled_with(
diff --git a/lib/kernels/src/kernels/flat_kernels.cc b/lib/kernels/src/kernels/flat_kernels.cc
new file mode 100644
index 0000000000..1032e081e7
--- /dev/null
+++ b/lib/kernels/src/kernels/flat_kernels.cc
@@ -0,0 +1,42 @@
+#include "kernels/flat_kernels.h"
+#include "kernels/flat_kernels_cpu.h"
+#include "kernels/flat_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Flat {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    float *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &input,
+                     float const *output_grad_ptr,
+                     float *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*input=*/input,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Flat
diff --git a/lib/kernels/src/kernels/flat_kernels_cpu.cc b/lib/kernels/src/kernels/flat_kernels_cpu.cc
new file mode 100644
index 0000000000..b7de8dd8ff
--- /dev/null
+++ b/lib/kernels/src/kernels/flat_kernels_cpu.cc
@@ -0,0 +1,16 @@
+#include "kernels/flat_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Flat {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        float *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &input,
+                         float const *output_grad_ptr,
+                         float *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Flat
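fill_with_zeros and the accessor formatters below all funnel through DataTypeDispatch1, which turns a runtime DataType into a compile-time template argument. A self-contained sketch of that dispatch idea (deliberately simplified; the real utility lives in kernels/datatype_dispatch.h and supports more types and argument forwarding):

#include <cstdio>
#include <stdexcept>
#include <utility>

enum class DataType { FLOAT, INT32 };

template <DataType DT> struct real_type;
template <> struct real_type<DataType::FLOAT> { using type = float; };
template <> struct real_type<DataType::INT32> { using type = int; };

// A functor template instantiated once per data type, mirroring FillWithZeros.
template <DataType DT> struct PrintSize {
  void operator()() { printf("%zu bytes\n", sizeof(typename real_type<DT>::type)); }
};

// Runtime-to-compile-time dispatch, simplified from DataTypeDispatch1.
template <template <DataType> class F, typename... Args>
void dispatch(DataType dt, Args &&...args) {
  switch (dt) {
    case DataType::FLOAT: F<DataType::FLOAT>{}(std::forward<Args>(args)...); break;
    case DataType::INT32: F<DataType::INT32>{}(std::forward<Args>(args)...); break;
    default: throw std::runtime_error("unhandled DataType");
  }
}

int main() {
  dispatch<PrintSize>(DataType::FLOAT); // prints "4 bytes"
}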
diff --git a/lib/kernels/src/kernels/format_accessor_contents.cc b/lib/kernels/src/kernels/format_accessor_contents.cc
index d40e5c4268..cbdf2870dd 100644
--- a/lib/kernels/src/kernels/format_accessor_contents.cc
+++ b/lib/kernels/src/kernels/format_accessor_contents.cc
@@ -2,6 +2,7 @@
 #include "kernels/copy_tensor_accessor.h"
 #include "kernels/datatype_dispatch.h"
 #include "kernels/local_cpu_allocator.h"
+#include "op-attrs/tensor_shape.h"
 #include "utils/indent.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
 #include <sstream>
@@ -13,17 +14,18 @@ struct Print1DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 1_n);
 
-    positive_int ncols = accessor.shape.at(ff_dim_t{0_n});
+    positive_int ncols = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
 
     stream << "["
            << join_strings(
                   nonnegative_range(ncols.nonnegative_int_from_positive_int()),
                   " ",
                   [&](nonnegative_int col_idx) -> std::string {
-                    return fmt::to_string(accessor.at<DT>(FFOrdered{col_idx}));
+                    return fmt::to_string(
+                        accessor.at<DT>(TensorDimsCoord{FFOrdered{col_idx}}));
                   })
            << "]";
   }
@@ -32,10 +34,11 @@ struct Print1DCPUAccessorR {
 static std::string
     format_1d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 1_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 1_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print1DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print1DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -44,20 +47,21 @@ struct Print2DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 2_n);
 
-    positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n});
-    positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n});
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
 
     auto render_1d = [&](nonnegative_int dim0_idx) -> std::string {
       return "[" +
-             join_strings(nonnegative_range(
-                              dim1_size.nonnegative_int_from_positive_int()),
-                          " ",
-                          [&](nonnegative_int dim1_idx) -> std::string {
-                            return fmt::to_string(
-                                accessor.at<DT>(FFOrdered{dim0_idx, dim1_idx}));
-                          }) +
+             join_strings(
+                 nonnegative_range(
+                     dim1_size.nonnegative_int_from_positive_int()),
+                 " ",
+                 [&](nonnegative_int dim1_idx) -> std::string {
+                   return fmt::to_string(accessor.at<DT>(
+                       TensorDimsCoord{FFOrdered{dim0_idx, dim1_idx}}));
+                 }) +
              "]";
     };
 
@@ -74,10 +78,11 @@ struct Print2DCPUAccessorR {
 static std::string
     format_2d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 2_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 2_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print2DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print2DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -86,12 +91,12 @@ struct Print3DCPUAccessorR {
   void operator()(GenericTensorAccessorR const &accessor,
                   std::ostream &stream) {
     ASSERT(accessor.device_type == DeviceType::CPU);
-    nonnegative_int dims = accessor.shape.num_dims();
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
     ASSERT(dims == 3_n);
 
-    positive_int dim0_size = accessor.shape.at(ff_dim_t{0_n});
-    positive_int dim1_size = accessor.shape.at(ff_dim_t{1_n});
-    positive_int dim2_size = accessor.shape.at(ff_dim_t{2_n});
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
+    positive_int dim2_size = dim_at_idx(accessor.shape.dims, ff_dim_t{2_n});
 
     auto render_1d = [&](nonnegative_int dim0_idx,
                          nonnegative_int dim1_idx) -> std::string {
@@ -100,8 +105,9 @@ struct Print3DCPUAccessorR {
                      dim2_size.nonnegative_int_from_positive_int()),
                  " ",
                  [&](nonnegative_int dim2_idx) -> std::string {
-                   return fmt::to_string(accessor.at<DT>(
-                       FFOrdered{dim0_idx, dim1_idx, dim2_idx}));
+                   return fmt::to_string(
+                       accessor.at<DT>(TensorDimsCoord{
+                           FFOrdered{dim0_idx, dim1_idx, dim2_idx}}));
                  }) +
              "]";
     };
@@ -131,10 +137,85 @@ struct Print3DCPUAccessorR {
 static std::string
     format_3d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   ASSERT(accessor.device_type == DeviceType::CPU);
-  ASSERT(accessor.shape.num_dims() == 3_n);
+  ASSERT(get_num_dims(accessor.shape.dims) == 3_n);
 
   std::ostringstream oss;
-  DataTypeDispatch1<Print3DCPUAccessorR>{}(accessor.data_type, accessor, oss);
+  DataTypeDispatch1<Print3DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
+  return oss.str();
+}
+
+template <DataType DT>
+struct Print4DCPUAccessorR {
+  void operator()(GenericTensorAccessorR const &accessor,
+                  std::ostream &stream) {
+    ASSERT(accessor.device_type == DeviceType::CPU);
+    nonnegative_int dims = get_num_dims(accessor.shape.dims);
+    ASSERT(dims == 4_n);
+
+    positive_int dim0_size = dim_at_idx(accessor.shape.dims, ff_dim_t{0_n});
+    positive_int dim1_size = dim_at_idx(accessor.shape.dims, ff_dim_t{1_n});
+    positive_int dim2_size = dim_at_idx(accessor.shape.dims, ff_dim_t{2_n});
+    positive_int dim3_size = dim_at_idx(accessor.shape.dims, ff_dim_t{3_n});
+
+    auto render_1d = [&](nonnegative_int dim0_idx,
+                         nonnegative_int dim1_idx,
+                         nonnegative_int dim2_idx) -> std::string {
+      return "[" +
+             join_strings(
+                 nonnegative_range(
+                     dim3_size.nonnegative_int_from_positive_int()),
+                 " ",
+                 [&](nonnegative_int dim3_idx) -> std::string {
+                   return fmt::to_string(accessor.at<DT>(TensorDimsCoord{
+                       FFOrdered{dim0_idx, dim1_idx, dim2_idx, dim3_idx}}));
+                 }) +
+             "]";
+    };
+
+    auto render_2d = [&](nonnegative_int dim0_idx,
+                         nonnegative_int dim1_idx) -> std::string {
+      return "[\n" +
+             indent(join_strings(
+                 nonnegative_range(
+                     dim2_size.nonnegative_int_from_positive_int()),
+                 "\n",
+                 [&](nonnegative_int dim2_idx) -> std::string {
+                   return render_1d(dim0_idx, dim1_idx, dim2_idx);
+                 })) +
+             "\n]";
+    };
+
+    auto render_3d = [&](nonnegative_int dim0_idx) -> std::string {
+      return "[\n" +
+             indent(join_strings(
+                 nonnegative_range(
+                     dim1_size.nonnegative_int_from_positive_int()),
+                 "\n",
+                 [&](nonnegative_int dim1_idx) -> std::string {
+                   return render_2d(dim0_idx, dim1_idx);
+                 })) +
+             "\n]";
+    };
+
+    stream << "[\n"
+           << indent(join_strings(
+                  nonnegative_range(
+                      dim0_size.nonnegative_int_from_positive_int()),
+                  "\n",
+                  render_3d))
+           << "\n]";
+  }
+};
+
+static std::string
+    format_4d_accessor_r_contents(GenericTensorAccessorR const &accessor) {
+  ASSERT(accessor.device_type == DeviceType::CPU);
+  ASSERT(get_num_dims(accessor.shape.dims) == 4_n);
+
+  std::ostringstream oss;
+  DataTypeDispatch1<Print4DCPUAccessorR>{}(
+      accessor.shape.data_type, accessor, oss);
   return oss.str();
 }
 
@@ -156,12 +237,18 @@ static std::string
       read_only_accessor_from_write_accessor(accessor));
 }
 
+static std::string
+    format_4d_accessor_w_contents(GenericTensorAccessorW const &accessor) {
+  return format_4d_accessor_r_contents(
+      read_only_accessor_from_write_accessor(accessor));
+}
+
 std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
   Allocator cpu_allocator = create_local_cpu_memory_allocator();
   GenericTensorAccessorR cpu_accessor =
       copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator);
-  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  int num_dims = get_num_dims(cpu_accessor.shape.dims).unwrap_nonnegative();
   switch (num_dims) {
     case 1:
       return format_1d_accessor_r_contents(cpu_accessor);
@@ -169,6 +256,8 @@ std::string format_accessor_r_contents(GenericTensorAccessorR const &accessor) {
       return format_2d_accessor_r_contents(cpu_accessor);
     case 3:
       return format_3d_accessor_r_contents(cpu_accessor);
+    case 4:
+      return format_4d_accessor_r_contents(cpu_accessor);
     default:
       PANIC("Unhandled accessor dimensionality", num_dims);
   }
@@ -179,7 +268,7 @@ std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
   GenericTensorAccessorW cpu_accessor =
       copy_tensor_accessor_w_to_cpu_if_necessary(accessor, cpu_allocator);
-  int num_dims = cpu_accessor.shape.num_dims().unwrap_nonnegative();
+  int num_dims = get_num_dims(cpu_accessor.shape.dims).unwrap_nonnegative();
   switch (num_dims) {
     case 1:
       return format_1d_accessor_w_contents(cpu_accessor);
@@ -187,6 +276,8 @@ std::string format_accessor_w_contents(GenericTensorAccessorW const &accessor) {
       return format_2d_accessor_w_contents(cpu_accessor);
     case 3:
       return format_3d_accessor_w_contents(cpu_accessor);
+    case 4:
+      return format_4d_accessor_w_contents(cpu_accessor);
     default:
       PANIC("Unhandled accessor dimensionality", num_dims);
   }
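For reference, the nested render_*d helpers produce bracketed rows with one level of indentation per dimension, so a 2x3 accessor formats roughly as (illustrative; exact whitespace comes from indent and join_strings):

[
  [1 2 3]
  [4 5 6]
]

The new 4D path adds one more render level to the same recursion rather than changing the scheme.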
diff --git a/lib/kernels/src/kernels/gather_kernels.cc b/lib/kernels/src/kernels/gather_kernels.cc
new file mode 100644
index 0000000000..a21d132511
--- /dev/null
+++ b/lib/kernels/src/kernels/gather_kernels.cc
@@ -0,0 +1,66 @@
+#include "kernels/gather_kernels.h"
+#include "kernels/gather_kernels_cpu.h"
+#include "kernels/gather_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Gather {
+
+std::optional<GatherPerDeviceState> init_kernel(DeviceType device_type,
+                                                device_handle_t const &handle,
+                                                ff_dim_t dim) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*dim=*/dim);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<GatherPerDeviceState> const &per_device_state,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorR const &index,
+                    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input=*/input,
+        /*index=*/index,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*index=*/index,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<GatherPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &index,
+    GenericTensorAccessorW const &input_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_grad=*/output_grad,
+        /*index=*/index,
+        /*input_grad=*/input_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(
+        /*output_grad=*/output_grad,
+        /*index=*/index,
+        /*input_grad=*/input_grad);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Gather
diff --git a/lib/kernels/src/kernels/gather_kernels_cpu.cc b/lib/kernels/src/kernels/gather_kernels_cpu.cc
new file mode 100644
index 0000000000..ed216802b3
--- /dev/null
+++ b/lib/kernels/src/kernels/gather_kernels_cpu.cc
@@ -0,0 +1,17 @@
+#include "kernels/gather_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Gather {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorR const &index,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &index,
+                         GenericTensorAccessorW const &input_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Gather
diff --git a/lib/kernels/src/kernels/layer_norm_kernels.cc b/lib/kernels/src/kernels/layer_norm_kernels.cc
new file mode 100644
index 0000000000..3db6ec734c
--- /dev/null
+++ b/lib/kernels/src/kernels/layer_norm_kernels.cc
@@ -0,0 +1,99 @@
+#include "kernels/layer_norm_kernels.h"
+#include "kernels/layer_norm_kernels_cpu.h"
+#include "kernels/layer_norm_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::LayerNorm {
+
+std::optional<LayerNormPerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                Allocator &allocator,
+                bool elementwise_affine,
+                int64_t effective_batch_size,
+                int64_t effective_num_elements,
+                float eps) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*allocator=*/allocator,
+        /*elementwise_affine=*/elementwise_affine,
+        /*effective_batch_size=*/effective_batch_size,
+        /*effective_num_elements=*/effective_num_elements,
+        /*eps=*/eps);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(
+    device_stream_t const &stream,
+    std::optional<LayerNormPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output,
+    GenericTensorAccessorW const &gamma,
+    GenericTensorAccessorW const &beta) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input=*/input,
+        /*output=*/output,
+        /*gamma=*/gamma,
+        /*beta=*/beta);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output,
+        /*gamma=*/gamma,
+        /*beta=*/beta);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<LayerNormPerDeviceState> const &per_device_state,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &gamma,
+    GenericTensorAccessorW const &gamma_grad,
+    GenericTensorAccessorW const &beta_grad) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*gamma=*/gamma,
+        /*gamma_grad=*/gamma_grad,
+        /*beta_grad=*/beta_grad);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*gamma=*/gamma,
+        /*gamma_grad=*/gamma_grad,
+        /*beta_grad=*/beta_grad);
+  }
+}
+
+void cleanup_kernel(
+    DeviceType device_type,
+    std::optional<LayerNormPerDeviceState> const &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::LayerNorm
diff --git a/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc b/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc
new file mode 100644
index 0000000000..f6922f7cf4
--- /dev/null
+++ b/lib/kernels/src/kernels/layer_norm_kernels_cpu.cc
@@ -0,0 +1,21 @@
+#include "kernels/layer_norm_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::LayerNorm {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output,
+                        GenericTensorAccessorW const &gamma,
+                        GenericTensorAccessorW const &beta) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output_grad,
+                         GenericTensorAccessorR const &input,
+                         GenericTensorAccessorW const &input_grad,
+                         GenericTensorAccessorR const &gamma,
+                         GenericTensorAccessorW const &gamma_grad,
+                         GenericTensorAccessorW const &beta_grad) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::LayerNorm
diff --git a/lib/kernels/src/kernels/legion_dim.cc b/lib/kernels/src/kernels/legion_dim.cc
index 47b5b3e5ed..f3fa67387a 100644
--- a/lib/kernels/src/kernels/legion_dim.cc
+++ b/lib/kernels/src/kernels/legion_dim.cc
@@ -1,8 +1,22 @@
 #include "kernels/legion_dim.h"
+#include "op-attrs/tensor_dims.h"
 #include "utils/archetypes/value_type.h"
 
 namespace FlexFlow {
 
+positive_int dim_at_idx(TensorDims const &tensor_dims,
+                        legion_dim_t legion_dim) {
+  return dim_at_idx(
+      tensor_dims,
+      ff_dim_from_legion_dim(legion_dim, get_num_dims(tensor_dims)));
+}
+
+positive_int &dim_at_idx(TensorDims &tensor_dims, legion_dim_t legion_dim) {
+  return dim_at_idx(
+      tensor_dims,
+      ff_dim_from_legion_dim(legion_dim, get_num_dims(tensor_dims)));
+}
+
 using T = value_type<0>;
 
 template std::set<legion_dim_t> key_range(LegionOrdered<T> const &);
diff --git a/lib/kernels/src/kernels/linear_kernels.cc b/lib/kernels/src/kernels/linear_kernels.cc
new file mode 100644
index 0000000000..f301e89b6e
--- /dev/null
+++ b/lib/kernels/src/kernels/linear_kernels.cc
@@ -0,0 +1,148 @@
+#include "kernels/linear_kernels.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/linear_kernels_cpu.h" +#include "kernels/linear_kernels_gpu.h" +#include "kernels/local_cuda_allocator.h" +#include + +using namespace FlexFlow::Kernels::Linear; + +namespace FlexFlow { + +std::optional + linear_init_kernel(DeviceType device_type, + device_handle_t const &handle, + std::optional activation, + std::optional regularizer, + bool use_bias, + DataType input_type, + DataType weight_type, + DataType output_type, + int batch_size, + int channel) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*activation=*/activation, + /*regularizer=*/regularizer, + /*use_bias=*/use_bias, + /*input_type=*/input_type, + /*weight_type=*/weight_type, + /*output_type=*/output_type, + /*batch_size=*/batch_size, + /*channel=*/channel); + } else { + ASSERT(device_type == DeviceType::CPU); + return std::nullopt; + } +} + +void linear_forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &input_accessor, + GenericTensorAccessorW const &output_accessor, + GenericTensorAccessorR const &filter_accessor, + std::optional const &bias_accessor) { + if (stream.is_gpu()) { + positive_int in_dim = dim_at_idx(input_accessor.shape.dims, ff_dim_t{1_n}); + positive_int out_dim = + dim_at_idx(output_accessor.shape.dims, ff_dim_t{1_n}); + positive_int batch_size = + dim_at_idx(input_accessor.shape.dims, ff_dim_t{0_n}); + + float const *bias_ptr = nullptr; + if (bias_accessor.has_value()) { + bias_ptr = bias_accessor.value().get(); + } + + ASSERT(per_device_state.has_value()); + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*filter_ptr=*/filter_accessor.get_float_ptr(), + /*bias_ptr=*/bias_ptr, + /*in_dim=*/in_dim.int_from_positive_int(), + /*out_dim=*/out_dim.int_from_positive_int(), + /*batch_size=*/batch_size.int_from_positive_int()); + } else { + ASSERT(stream.is_cpu()); + ASSERT(per_device_state == std::nullopt); + linear_cpu_forward_kernel( + /*attrs=*/attrs, + /*input_accessor=*/input_accessor, + /*output_accessor=*/output_accessor, + /*filter_accessor=*/filter_accessor, + /*bias_accessor=*/bias_accessor); + } +} + +void linear_backward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + LinearAttrs const &attrs, + GenericTensorAccessorR const &output, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &kernel, + GenericTensorAccessorW const &kernel_grad, + std::optional const &bias_grad) { + if (stream.is_gpu()) { + float *bias_grad_ptr = + transform(bias_grad, [](GenericTensorAccessorW const &b) { + return b.get_float_ptr(); + }).value_or(nullptr); + + positive_int in_dim = dim_at_idx(input.shape.dims, ff_dim_t{1_n}); + positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{1_n}); + positive_int batch_size = dim_at_idx(input.shape.dims, ff_dim_t{0_n}); + + Allocator gpu_allocator = create_local_cuda_memory_allocator(); + GenericTensorAccessorW modifiable_output_grad = + copy_tensor_accessor_r(output_grad, gpu_allocator); + + ASSERT(per_device_state.has_value()); + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + 
+void linear_backward_kernel(
+    device_stream_t const &stream,
+    std::optional<LinearPerDeviceState> const &per_device_state,
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &kernel,
+    GenericTensorAccessorW const &kernel_grad,
+    std::optional<GenericTensorAccessorW> const &bias_grad) {
+  if (stream.is_gpu()) {
+    float *bias_grad_ptr =
+        transform(bias_grad, [](GenericTensorAccessorW const &b) {
+          return b.get_float_ptr();
+        }).value_or(nullptr);
+
+    positive_int in_dim = dim_at_idx(input.shape.dims, ff_dim_t{1_n});
+    positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+    positive_int batch_size = dim_at_idx(input.shape.dims, ff_dim_t{0_n});
+
+    Allocator gpu_allocator = create_local_cuda_memory_allocator();
+    GenericTensorAccessorW modifiable_output_grad =
+        copy_tensor_accessor_r(output_grad, gpu_allocator);
+
+    ASSERT(per_device_state.has_value());
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_ptr=*/output.get_float_ptr(),
+        /*output_grad_ptr=*/modifiable_output_grad.get_float_ptr(),
+        /*input_ptr=*/input.get_float_ptr(),
+        /*input_grad_ptr=*/input_grad.get_float_ptr(),
+        /*kernel_ptr=*/kernel.get_float_ptr(),
+        /*kernel_grad_ptr=*/kernel_grad.get_float_ptr(),
+        /*bias_grad_ptr=*/bias_grad_ptr,
+        /*in_dim=*/in_dim.int_from_positive_int(),
+        /*out_dim=*/out_dim.int_from_positive_int(),
+        /*batch_size=*/batch_size.int_from_positive_int());
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    linear_cpu_backward_kernel(
+        /*attrs=*/attrs,
+        /*output=*/output,
+        /*output_grad=*/output_grad,
+        /*input=*/input,
+        /*input_grad=*/input_grad,
+        /*kernel=*/kernel,
+        /*kernel_grad=*/kernel_grad,
+        /*bias_grad=*/bias_grad);
+  }
+}
+
+void linear_cleanup_kernel(
+    DeviceType device_type,
+    std::optional<LinearPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/linear_kernels_cpu.cc b/lib/kernels/src/kernels/linear_kernels_cpu.cc
new file mode 100644
index 0000000000..f26df8081e
--- /dev/null
+++ b/lib/kernels/src/kernels/linear_kernels_cpu.cc
@@ -0,0 +1,96 @@
+#include "kernels/linear_kernels_cpu.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/map_tensor_accessors.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "utils/exception.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void linear_cpu_forward_kernel(
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &output,
+    GenericTensorAccessorR const &projection,
+    std::optional<GenericTensorAccessorR> const &bias) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  tensor_accessor_matmul_to(
+      input, tensor_accessor_transpose(projection, cpu_allocator), output);
+
+  ASSERT(attrs.use_bias == bias.has_value());
+  if (bias.has_value()) {
+    GenericTensorAccessorW broadcasted_bias = tensor_accessor_broadcast(
+        bias.value(), output.shape.dims, cpu_allocator);
+    tensor_accessor_elementwise_add_to(
+        read_only_accessor_from_write_accessor(output),
+        read_only_accessor_from_write_accessor(broadcasted_bias),
+        output);
+  }
+
+  if (attrs.activation.has_value()) {
+    switch (attrs.activation.value()) {
+      case Activation::RELU:
+        tensor_accessor_relu_to(read_only_accessor_from_write_accessor(output),
+                                output);
+        break;
+      default:
+        PANIC("Unhandled activation function", attrs.activation.value());
+    }
+  }
+}
+
+// template <typename T>
+static float single_element_relu_bwd(float elem) {
+  if (elem > 0) {
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+void linear_cpu_backward_kernel(
+    LinearAttrs const &attrs,
+    GenericTensorAccessorR const &output,
+    GenericTensorAccessorR const &output_grad,
+    GenericTensorAccessorR const &input,
+    GenericTensorAccessorW const &input_grad,
+    GenericTensorAccessorR const &projection,
+    GenericTensorAccessorW const &projection_grad,
+    std::optional<GenericTensorAccessorW> const &bias_grad) {
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  std::optional<GenericTensorAccessorR> processed_output_grad = std::nullopt;
+  if (attrs.activation.has_value()) {
+    switch (attrs.activation.value()) {
+      case Activation::RELU:
+        processed_output_grad =
+            read_only_accessor_from_write_accessor(map_tensor_accessor(
+                output_grad, single_element_relu_bwd, cpu_allocator));
+        break;
+      default:
+        PANIC("Unhandled activation function", attrs.activation.value());
+    }
+  } else {
+    processed_output_grad = output_grad;
+  }
+
+  tensor_accessor_matmul_to(
+      processed_output_grad.value(), projection, input_grad);
+  tensor_accessor_transpose_to(
+      tensor_accessor_matmul(
+          read_only_accessor_from_write_accessor(
+              tensor_accessor_transpose(input, cpu_allocator)),
+          processed_output_grad.value(),
+          cpu_allocator),
+      projection_grad);
+
+  if (bias_grad.has_value()) {
+    tensor_accessor_reduce_to(
+        processed_output_grad.value(), ff_dim_t{0_n}, bias_grad.value());
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/loss_function_kernels.cc b/lib/kernels/src/kernels/loss_function_kernels.cc
new file mode 100644
index 0000000000..df96bab9b0
--- /dev/null
+++ b/lib/kernels/src/kernels/loss_function_kernels.cc
@@ -0,0 +1,126 @@
+#include "kernels/loss_function_kernels.h"
+#include "kernels/loss_function_kernels_cpu.h"
+#include "kernels/loss_function_kernels_gpu.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_kernel(
+    device_stream_t const &stream,
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor) {
+  if (stream.is_gpu()) {
+    sparse_categorical_crossentropy_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*num_samples=*/num_samples,
+        /*num_classes=*/num_classes,
+        /*k=*/k,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    sparse_categorical_crossentropy_loss_backward_cpu_kernel(
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*num_samples=*/num_samples,
+        /*num_classes=*/num_classes,
+        /*k=*/k,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void categorical_crossentropy_loss_backward_kernel(
+    device_stream_t const &stream,
+    GenericTensorAccessorW const &logit_grad,
+    GenericTensorAccessorR const &logit,
+    GenericTensorAccessorR const &label,
+    float scale_factor) {
+  if (stream.is_gpu()) {
+    categorical_crossentropy_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad.get_float_ptr(),
+        /*logit_ptr=*/logit.get_float_ptr(),
+        /*label_ptr=*/label.get_float_ptr(),
+        /*logit_volume=*/
+        get_num_elements(logit.shape.dims).int_from_positive_int(),
+        /*logit_grad_volume=*/
+        get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    categorical_crossentropy_loss_backward_cpu_kernel(
+        /*logit_grad=*/logit_grad,
+        /*logit=*/logit,
+        /*label=*/label,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void mean_squared_error_avg_loss_backward_kernel(device_stream_t const &stream,
+                                                 float *logit_grad_ptr,
+                                                 float const *logit_ptr,
+                                                 float const *label_ptr,
+                                                 size_t logit_volume,
+                                                 size_t logit_grad_volume,
+                                                 float scale_factor) {
+  if (stream.is_gpu()) {
+    mean_squared_error_avg_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    mean_squared_error_avg_loss_backward_cpu_kernel(
+        /*logit_grad_ptr=*/logit_grad_ptr,
+        /*logit_ptr=*/logit_ptr,
+        /*label_ptr=*/label_ptr,
+        /*logit_volume=*/logit_volume,
+        /*logit_grad_volume=*/logit_grad_volume,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+void identity_loss_backward_kernel(device_stream_t const &stream,
+                                   float *loss_grad_ptr,
+                                   float const *loss_ptr,
+                                   size_t loss_volume,
+                                   size_t loss_grad_volume,
+                                   float scale_factor) {
+  if (stream.is_gpu()) {
+    identity_loss_backward_gpu_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*loss_grad_ptr=*/loss_grad_ptr,
+        /*loss_ptr=*/loss_ptr,
+        /*loss_volume=*/loss_volume,
+        /*loss_grad_volume=*/loss_grad_volume,
+        /*scale_factor=*/scale_factor);
+  } else {
+    ASSERT(stream.is_cpu());
+    identity_loss_backward_cpu_kernel(
+        /*loss_grad_ptr=*/loss_grad_ptr,
+        /*loss_ptr=*/loss_ptr,
+        /*loss_volume=*/loss_volume,
+        /*loss_grad_volume=*/loss_grad_volume,
+        /*scale_factor=*/scale_factor);
+  }
+}
+
+} // namespace FlexFlow
diff --git a/lib/kernels/src/kernels/loss_function_kernels_cpu.cc b/lib/kernels/src/kernels/loss_function_kernels_cpu.cc
new file mode 100644
index 0000000000..b0d5f95558
--- /dev/null
+++ b/lib/kernels/src/kernels/loss_function_kernels_cpu.cc
@@ -0,0 +1,51 @@
+#include "kernels/loss_function_kernels_cpu.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "op-attrs/datatype_value.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+void sparse_categorical_crossentropy_loss_backward_cpu_kernel(
+    float *logit_grad_ptr,
+    float const *logit_ptr,
+    int const *label_ptr,
+    size_t logit_volume,
+    size_t logit_grad_volume,
+    int num_samples,
+    int num_classes,
+    int k,
+    float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+void categorical_crossentropy_loss_backward_cpu_kernel(
+    GenericTensorAccessorW const &logit_grad,
+    GenericTensorAccessorR const &logit,
+    GenericTensorAccessorR const &label,
+    float scale_factor) {
+  tensor_accessor_elementwise_subtract_to(
+      /*lhs=*/logit,
+      /*rhs=*/label,
+      /*output=*/logit_grad);
+  tensor_accessor_scale_by_constant_inplace(logit_grad, scale_factor);
+}
+
+void mean_squared_error_avg_loss_backward_cpu_kernel(float *logit_grad_ptr,
+                                                     float const *logit_ptr,
+                                                     float const *label_ptr,
+                                                     size_t logit_volume,
+                                                     size_t logit_grad_volume,
+                                                     float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+void identity_loss_backward_cpu_kernel(float *loss_grad_ptr,
+                                       float const *loss_ptr,
+                                       size_t loss_volume,
+                                       size_t loss_grad_volume,
+                                       float scale_factor) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
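categorical_crossentropy_loss_backward_cpu_kernel reduces to an elementwise subtract followed by a scale, i.e. the usual softmax cross-entropy gradient g = scale * (p - y), which is valid when logit holds post-softmax probabilities and label is a one-hot (or soft) distribution, as the subtract-then-scale structure suggests. A self-contained scalar version of the same computation:

#include <vector>

// Gradient of softmax + categorical cross-entropy w.r.t. the logits,
// mirroring the subtract-then-scale CPU kernel above.
std::vector<float> ce_backward(std::vector<float> const &softmax_out,
                               std::vector<float> const &one_hot_label,
                               float scale) {
  std::vector<float> grad(softmax_out.size());
  for (size_t i = 0; i < grad.size(); i++) {
    grad[i] = scale * (softmax_out[i] - one_hot_label[i]);
  }
  return grad;
}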
diff --git a/lib/kernels/src/kernels/optimizer_kernels.cc b/lib/kernels/src/kernels/optimizer_kernels.cc
new file mode 100644
index 0000000000..3d92d9ddc9
--- /dev/null
+++ b/lib/kernels/src/kernels/optimizer_kernels.cc
@@ -0,0 +1,98 @@
+#include "kernels/optimizer_kernels.h"
+#include "kernels/optimizer_kernels_cpu.h"
+#include "kernels/optimizer_kernels_gpu.h"
+#include <libassert/assert.hpp>
+
+namespace FlexFlow {
+
+void sgd_update_task(device_stream_t const &stream,
+                     device_handle_t const &handle,
+                     float lr,
+                     float momentum,
+                     bool nesterov,
+                     float weight_decay,
+                     GenericTensorAccessorR const &weight_grad,
+                     int num_replicas,
+                     GenericTensorAccessorW const &weight,
+                     std::optional<GenericTensorAccessorW> const &sgd_v) {
+  ASSERT(sgd_v.has_value() == (momentum > 0.0f));
+
+  if (stream.is_gpu()) {
+    float *sgd_v_ptr = nullptr;
+    if (momentum > 0.0f) {
+      sgd_v_ptr = sgd_v.value().get_float_ptr();
+    }
+
+    gpu_sgd_nccl_update_task(
+        /*stream=*/stream.require_gpu(),
+        /*lr=*/lr,
+        /*momentum=*/momentum,
+        /*nesterov=*/nesterov,
+        /*weight_decay=*/weight_decay,
+        /*handle=*/handle.require_for_gpu(),
+        /*weight_grad_ptr=*/weight_grad.get_float_ptr(),
+        /*size=*/
+        get_num_elements(weight_grad.shape.dims).int_from_positive_int(),
+        /*weight_ptr=*/weight.get_float_ptr(),
+        /*sgd_v_ptr=*/sgd_v_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(handle.is_for_cpu());
+    cpu_sgd_update_task(
+        /*lr=*/lr,
+        /*momentum=*/momentum,
+        /*nesterov=*/nesterov,
+        /*weight_decay=*/weight_decay,
+        /*weight_grad=*/weight_grad,
+        /*weight=*/weight,
+        /*sgd_v=*/sgd_v);
+  }
+}
+
+void adam_update_task(device_stream_t const &stream,
+                      device_handle_t const &handle,
+                      float alpha_t,
+                      float beta1,
+                      float beta2,
+                      float weight_decay,
+                      float epsilon,
+                      float const *weight_grad_ptr,
+                      size_t size,
+                      int num_replicas,
+                      float *weight_ptr,
+                      float *adam_v_ptr,
+                      float *adam_m_ptr) {
+  if (stream.is_gpu()) {
+    gpu_adam_nccl_update_task(
+        /*stream=*/stream.require_gpu(),
+        /*alpha_t=*/alpha_t,
+        /*beta1=*/beta1,
+        /*beta2=*/beta2,
+        /*weight_decay=*/weight_decay,
+        /*epsilon=*/epsilon,
+        /*handle=*/handle.require_for_gpu(),
+        /*weight_grad_ptr=*/weight_grad_ptr,
+        /*size=*/size,
+        /*weight_ptr=*/weight_ptr,
+        /*adam_v_ptr=*/adam_v_ptr,
+        /*adam_m_ptr=*/adam_m_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(handle.is_for_cpu());
+    cpu_adam_update_task(
+        /*alpha_t=*/alpha_t,
+        /*beta1=*/beta1,
+        /*beta2=*/beta2,
+        /*weight_decay=*/weight_decay,
+        /*epsilon=*/epsilon,
+        /*weight_grad_ptr=*/weight_grad_ptr,
+        /*size=*/size,
+        /*num_replicas=*/num_replicas,
+        /*weight_ptr=*/weight_ptr,
+        /*adam_v_ptr=*/adam_v_ptr,
+        /*adam_m_ptr=*/adam_m_ptr);
+  }
+}
+
+} // namespace FlexFlow
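(The GPU branch of adam_update_task originally asserted stream.is_cpu(), which would always fail there; the stray assertion is dropped above since handle.require_for_gpu() already enforces the GPU precondition.) The CPU SGD implementation that follows strings together accessor ops; scalarized, the update it performs is the classic SGD-with-momentum step. This sketch mirrors that code one-to-one but is not the FlexFlow API:

#include <cstddef>
#include <vector>

// gt = grad + weight_decay * w;  v = momentum * v + gt;
// gt = gt + momentum * v (Nesterov) or gt = v;  w -= lr * gt.
void sgd_step(std::vector<float> &w, std::vector<float> const &grad,
              std::vector<float> &v, float lr, float momentum,
              bool nesterov, float weight_decay) {
  for (size_t i = 0; i < w.size(); i++) {
    float gt = grad[i] + weight_decay * w[i];
    if (momentum > 0.0f) {
      v[i] = momentum * v[i] + gt;
      gt = nesterov ? (gt + momentum * v[i]) : v[i];
    }
    w[i] -= lr * gt;
  }
}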
diff --git a/lib/kernels/src/kernels/optimizer_kernels_cpu.cc b/lib/kernels/src/kernels/optimizer_kernels_cpu.cc
new file mode 100644
index 0000000000..7842215972
--- /dev/null
+++ b/lib/kernels/src/kernels/optimizer_kernels_cpu.cc
@@ -0,0 +1,76 @@
+#include "kernels/optimizer_kernels_cpu.h"
+#include "kernels/format_accessor_contents.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/tensor_accessor_binary_ops.h"
+#include "kernels/tensor_accessor_unary_ops.h"
+#include "utils/exception.h"
+
+namespace FlexFlow {
+
+void cpu_sgd_update_task(float lr,
+                         float momentum,
+                         bool nesterov,
+                         float weight_decay,
+                         GenericTensorAccessorR const &weight_grad,
+                         GenericTensorAccessorW const &weight,
+                         std::optional<GenericTensorAccessorW> const &sgd_v) {
+  // based on sgd_update in lib/kernels/src/cuda/optimizer_kernels.cu
+
+  Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+  std::cerr << "weight_grad=" << format_accessor_r_contents(weight_grad)
+            << std::endl
+            << "weight=" << format_accessor_w_contents(weight) << std::endl;
+
+  GenericTensorAccessorW gt = tensor_accessor_elementwise_add(
+      weight_grad,
+      read_only_accessor_from_write_accessor(tensor_accessor_scale_by_constant(
+          read_only_accessor_from_write_accessor(weight),
+          weight_decay,
+          cpu_allocator)),
+      cpu_allocator);
+
+  if (momentum > 0.0f) {
+    tensor_accessor_scale_by_constant_inplace(sgd_v.value(), momentum);
+    tensor_accessor_elementwise_add_to(
+        read_only_accessor_from_write_accessor(sgd_v.value()),
+        read_only_accessor_from_write_accessor(gt),
+        sgd_v.value());
+
+    if (nesterov) {
+      tensor_accessor_elementwise_add_to(
+          read_only_accessor_from_write_accessor(gt),
+          read_only_accessor_from_write_accessor(
+              tensor_accessor_scale_by_constant(
+                  read_only_accessor_from_write_accessor(sgd_v.value()),
+                  momentum,
+                  cpu_allocator)),
+          gt);
+    } else {
+      copy_accessor_data_to_l_from_r(
+          gt, read_only_accessor_from_write_accessor(sgd_v.value()));
+    }
+  }
+
+  tensor_accessor_elementwise_subtract_to(
+      read_only_accessor_from_write_accessor(weight),
+      read_only_accessor_from_write_accessor(tensor_accessor_scale_by_constant(
+          read_only_accessor_from_write_accessor(gt), lr, cpu_allocator)),
+      weight);
+}
+
+void cpu_adam_update_task(float alpha_t,
+                          float beta1,
+                          float beta2,
+                          float weight_decay,
+                          float epsilon,
+                          float const *weight_grad_ptr,
+                          size_t size,
+                          int num_replicas,
+                          float *weight_ptr,
+                          float *adam_v_ptr,
+                          float *adam_m_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow
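cpu_adam_update_task is still NOT_IMPLEMENTED. For reference, the standard Adam step its GPU counterpart is expected to compute looks like the following, assuming alpha_t is the bias-corrected learning rate precomputed by the caller (an assumption, since only the signature is visible here):

#include <cmath>
#include <cstddef>

// Standard Adam with decoupled-from-signature weight decay folded into the
// gradient, matching the parameter list of cpu_adam_update_task above.
void adam_step(float *w, float const *g, float *m, float *v, size_t n,
               float alpha_t, float beta1, float beta2, float weight_decay,
               float epsilon) {
  for (size_t i = 0; i < n; i++) {
    float grad = g[i] + weight_decay * w[i];
    m[i] = beta1 * m[i] + (1.0f - beta1) * grad;          // first moment
    v[i] = beta2 * v[i] + (1.0f - beta2) * grad * grad;   // second moment
    w[i] -= alpha_t * m[i] / (std::sqrt(v[i]) + epsilon);
  }
}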
diff --git a/lib/kernels/src/kernels/pool_2d_kernels.cc b/lib/kernels/src/kernels/pool_2d_kernels.cc
new file mode 100644
index 0000000000..6ebfc68c86
--- /dev/null
+++ b/lib/kernels/src/kernels/pool_2d_kernels.cc
@@ -0,0 +1,105 @@
+#include "kernels/pool_2d_kernels.h"
+#include "kernels/pool_2d_kernels_cpu.h"
+#include "kernels/pool_2d_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Pool2D {
+
+std::optional<Pool2DPerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                std::optional<Activation> activation,
+                int input_w,
+                int input_h,
+                int input_c,
+                int input_n,
+                int output_w,
+                int output_h,
+                int output_c,
+                int output_n,
+                int pad_h,
+                int pad_w,
+                int kernel_h,
+                int kernel_w,
+                int stride_h,
+                int stride_w,
+                PoolOp pool_type) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(
+        /*handle=*/handle.require_for_gpu(),
+        /*activation=*/activation,
+        /*input_w=*/input_w,
+        /*input_h=*/input_h,
+        /*input_c=*/input_c,
+        /*input_n=*/input_n,
+        /*output_w=*/output_w,
+        /*output_h=*/output_h,
+        /*output_c=*/output_c,
+        /*output_n=*/output_n,
+        /*pad_h=*/pad_h,
+        /*pad_w=*/pad_w,
+        /*kernel_h=*/kernel_h,
+        /*kernel_w=*/kernel_w,
+        /*stride_h=*/stride_h,
+        /*stride_w=*/stride_w,
+        /*pool_type=*/pool_type);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<Pool2DPerDeviceState> const &per_device_state,
+                    void const *input_ptr,
+                    void *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*input_ptr=*/input_ptr,
+        /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input_ptr=*/input_ptr,
+        /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<Pool2DPerDeviceState> const &per_device_state,
+    void const *output_ptr,
+    void const *output_grad_ptr,
+    void const *input_ptr,
+    void *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*per_device_state=*/per_device_state.value(),
+        /*output_ptr=*/output_ptr,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_ptr=*/input_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output_ptr=*/output_ptr,
+        /*output_grad_ptr=*/output_grad_ptr,
+        /*input_ptr=*/input_ptr,
+        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+void cleanup_kernel(DeviceType device_type,
+                    std::optional<Pool2DPerDeviceState> &per_device_state) {
+  if (device_type == DeviceType::GPU) {
+    gpu_cleanup_kernel(per_device_state.value());
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(per_device_state == std::nullopt);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Pool2D
diff --git a/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc b/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc
new file mode 100644
index 0000000000..f2d2141e96
--- /dev/null
+++ b/lib/kernels/src/kernels/pool_2d_kernels_cpu.cc
@@ -0,0 +1,17 @@
+#include "kernels/pool_2d_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::Pool2D {
+
+void cpu_forward_kernel(void const *input_ptr, void *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(void const *output_ptr,
+                         void const *output_grad_ptr,
+                         void const *input_ptr,
+                         void *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Pool2D
diff --git a/lib/kernels/src/kernels/reduce_kernels.cc b/lib/kernels/src/kernels/reduce_kernels.cc
new file mode 100644
index 0000000000..bd3d6a8cd1
--- /dev/null
+++ b/lib/kernels/src/kernels/reduce_kernels.cc
@@ -0,0 +1,62 @@
+#include "kernels/reduce_kernels.h"
+#include "kernels/reduce_kernels_cpu.h"
+#include "kernels/reduce_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reduce {
+
+std::optional<ReducePerDeviceState>
+    init_kernel(DeviceType device_type,
+                device_handle_t const &handle,
+                OperatorType const &operator_type,
+                size_t const &reduction_size,
+                TensorShape const &input_shape,
+                TensorShape const &output_shape) {
+  if (device_type == DeviceType::GPU) {
+    return gpu_init_kernel(/*handle=*/handle.require_for_gpu(),
+                           /*operator_type=*/operator_type,
+                           /*reduction_size=*/reduction_size,
+                           /*input_shape=*/input_shape,
+                           /*output_shape=*/output_shape);
+  } else {
+    ASSERT(device_type == DeviceType::CPU);
+    ASSERT(handle.is_for_cpu());
+    return std::nullopt;
+  }
+}
+
+void forward_kernel(device_stream_t const &stream,
+                    std::optional<ReducePerDeviceState> const &per_device_state,
+                    float const *input_ptr,
+                    float *output_ptr) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(/*stream=*/stream.require_gpu(),
+                       /*per_device_state=*/per_device_state.value(),
+                       /*input_ptr=*/input_ptr,
+                       /*output_ptr=*/output_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_forward_kernel(/*input_ptr=*/input_ptr,
+                       /*output_ptr=*/output_ptr);
+  }
+}
+
+void backward_kernel(
+    device_stream_t const &stream,
+    std::optional<ReducePerDeviceState> const &per_device_state,
+    float const *output_grad_ptr,
+    float *input_grad_ptr) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(/*stream=*/stream.require_gpu(),
+                        /*per_device_state=*/per_device_state.value(),
+                        /*output_grad_ptr=*/output_grad_ptr,
+                        /*input_grad_ptr=*/input_grad_ptr);
+  } else {
+    ASSERT(stream.is_cpu());
+    ASSERT(per_device_state == std::nullopt);
+    cpu_backward_kernel(/*output_grad_ptr=*/output_grad_ptr,
+                        /*input_grad_ptr=*/input_grad_ptr);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reduce
diff --git a/lib/kernels/src/kernels/reduce_kernels_cpu.cc b/lib/kernels/src/kernels/reduce_kernels_cpu.cc
new file mode 100644
index 0000000000..295e126b49
--- /dev/null
+++ b/lib/kernels/src/kernels/reduce_kernels_cpu.cc
@@ -0,0 +1,14 @@
+#include "kernels/reduce_kernels_cpu.h"
+#include "utils/exception.h"
+
+namespace FlexFlow::Kernels::Reduce {
+
+void cpu_forward_kernel(float const *input_ptr, float *output_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(float const *output_grad_ptr, float *input_grad_ptr) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Reduce
diff --git a/lib/kernels/src/kernels/reshape_kernels.cc b/lib/kernels/src/kernels/reshape_kernels.cc
new file mode 100644
index 0000000000..2ac90352bb
--- /dev/null
+++ b/lib/kernels/src/kernels/reshape_kernels.cc
@@ -0,0 +1,39 @@
+#include "kernels/reshape_kernels.h"
+#include "kernels/reshape_kernels_cpu.h"
+#include "kernels/reshape_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reshape {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input,
+                    GenericTensorAccessorW const &output) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*input=*/input,
+        /*output=*/output);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(
+        /*input=*/input,
+        /*output=*/output);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output,
+                     GenericTensorAccessorW const &input) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        /*stream=*/stream.require_gpu(),
+        /*output=*/output,
+        /*input=*/input);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(
+        /*output=*/output,
+        /*input=*/input);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reshape
diff --git a/lib/kernels/src/kernels/reshape_kernels_cpu.cc b/lib/kernels/src/kernels/reshape_kernels_cpu.cc
new file mode 100644
index 0000000000..b48272cdde
--- /dev/null
+++ b/lib/kernels/src/kernels/reshape_kernels_cpu.cc
@@ -0,0 +1,15 @@
+#include "kernels/reshape_kernels_cpu.h"
+
+namespace FlexFlow::Kernels::Reshape {
+
+void cpu_forward_kernel(GenericTensorAccessorR const &input,
+                        GenericTensorAccessorW const &output) {
+  NOT_IMPLEMENTED();
+}
+
+void cpu_backward_kernel(GenericTensorAccessorR const &output,
+                         GenericTensorAccessorW const &input) {
+  NOT_IMPLEMENTED();
+}
+
+} // namespace FlexFlow::Kernels::Reshape
diff --git a/lib/kernels/src/kernels/reverse_kernels.cc b/lib/kernels/src/kernels/reverse_kernels.cc
new file mode 100644
index 0000000000..301c60ea3d
--- /dev/null
+++ b/lib/kernels/src/kernels/reverse_kernels.cc
@@ -0,0 +1,33 @@
+#include "kernels/reverse_kernels.h"
+#include "kernels/reverse_kernels_cpu.h"
+#include "kernels/reverse_kernels_gpu.h"
+
+namespace FlexFlow::Kernels::Reverse {
+
+void forward_kernel(device_stream_t const &stream,
+                    GenericTensorAccessorR const &input_accessor,
+                    GenericTensorAccessorW &output_accessor,
+                    ReverseAttrs const &attrs) {
+  if (stream.is_gpu()) {
+    gpu_forward_kernel(
+        stream.require_gpu(), input_accessor, output_accessor, attrs);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_forward_kernel(input_accessor, output_accessor, attrs);
+  }
+}
+
+void backward_kernel(device_stream_t const &stream,
+                     GenericTensorAccessorR const &output_accessor,
+                     GenericTensorAccessorW &input_accessor,
+                     ReverseAttrs const &attrs) {
+  if (stream.is_gpu()) {
+    gpu_backward_kernel(
+        stream.require_gpu(), output_accessor, input_accessor, attrs);
+  } else {
+    ASSERT(stream.is_cpu());
+    cpu_backward_kernel(output_accessor, input_accessor, attrs);
+  }
+}
+
+} // namespace FlexFlow::Kernels::Reverse
diff --git a/lib/kernels/src/cpu/ops/reverse_kernels.cc b/lib/kernels/src/kernels/reverse_kernels_cpu.cc
similarity index 64%
rename from lib/kernels/src/cpu/ops/reverse_kernels.cc
rename to lib/kernels/src/kernels/reverse_kernels_cpu.cc
index 212a52881a..e21c986dd8 100644
--- a/lib/kernels/src/cpu/ops/reverse_kernels.cc
+++ b/lib/kernels/src/kernels/reverse_kernels_cpu.cc
@@ -1,5 +1,5 @@
-#include "kernels/datatype_dispatch.h"
 #include "kernels/reverse_kernels_cpu.h"
+#include "kernels/datatype_dispatch.h"
 #include
@@ -9,19 +9,19 @@ struct CPUReverseForwardKernel {
   void operator()(GenericTensorAccessorR const &input,
GenericTensorAccessorW &output, ReverseAttrs const &attrs) { - positive_int reverse_axis_size = input.shape.at(attrs.axis); + positive_int reverse_axis_size = dim_at_idx(input.shape.dims, attrs.axis); - for (ArrayCoord const &input_coord : get_array_coord_set(input.shape)) { + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { nonnegative_int input_reverse_axis_coord = input_coord.ff_ordered.at(attrs.axis); - ArrayCoord output_coord = input_coord; + TensorDimsCoord output_coord = input_coord; output_coord.ff_ordered.at(attrs.axis) = nonnegative_int{reverse_axis_size.int_from_positive_int() - input_reverse_axis_coord.unwrap_nonnegative() - 1}; - output.at
<DT>(output_coord.ff_ordered) = - input.at<DT>(input_coord.ff_ordered); + output.at<DT>(output_coord) = input.at<DT>
(input_coord); } } }; @@ -31,16 +31,17 @@ void cpu_forward_kernel(GenericTensorAccessorR const &input_accessor, ReverseAttrs const &attrs) { DataTypeDispatch1{}( - input_accessor.data_type, input_accessor, output_accessor, attrs); + input_accessor.shape.data_type, input_accessor, output_accessor, attrs); } void cpu_backward_kernel(GenericTensorAccessorR const &output_grad_accessor, GenericTensorAccessorW &input_grad_accessor, ReverseAttrs const &attrs) { - DataTypeDispatch1{}(output_grad_accessor.data_type, - output_grad_accessor, - input_grad_accessor, - attrs); + DataTypeDispatch1{}( + output_grad_accessor.shape.data_type, + output_grad_accessor, + input_grad_accessor, + attrs); } } // namespace FlexFlow::Kernels::Reverse diff --git a/lib/kernels/src/kernels/reverse_kernels_params.cc b/lib/kernels/src/kernels/reverse_kernels_params.cc index 0ad1a5ed20..cf72fb3eef 100644 --- a/lib/kernels/src/kernels/reverse_kernels_params.cc +++ b/lib/kernels/src/kernels/reverse_kernels_params.cc @@ -1,29 +1,31 @@ #include "kernels/reverse_kernels_params.h" +#include "op-attrs/tensor_dims.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { ReverseKernelsParams - compute_reverse_kernels_params(ArrayShape const &output_shape, + compute_reverse_kernels_params(TensorDims const &output_dims, ReverseAttrs const &attrs) { auto axis = attrs.axis; positive_int in_blk_size = 1_p; positive_int reverse_dim_size = 1_p; positive_int num_out_blks = 1_p; - for (nonnegative_int i : nonnegative_range(output_shape.num_dims())) { + for (nonnegative_int i : nonnegative_range(get_num_dims(output_dims))) { if (i < axis.value) { - in_blk_size *= output_shape.at(ff_dim_t{i}); + in_blk_size *= dim_at_idx(output_dims, ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output_shape.at(ff_dim_t{i}); + reverse_dim_size = dim_at_idx(output_dims, ff_dim_t{i}); } else { - num_out_blks *= output_shape.at(ff_dim_t{i}); + num_out_blks *= dim_at_idx(output_dims, ff_dim_t{i}); } } return ReverseKernelsParams{ - num_out_blks, - reverse_dim_size, - in_blk_size, - output_shape.num_elements(), + /*num_out_blks=*/num_out_blks, + /*reverse_dim_size=*/reverse_dim_size, + /*in_blk_size=*/in_blk_size, + /*out_size=*/get_num_elements(output_dims), }; } diff --git a/lib/kernels/src/kernels/softmax_kernels.cc b/lib/kernels/src/kernels/softmax_kernels.cc new file mode 100644 index 0000000000..3cc655dc7c --- /dev/null +++ b/lib/kernels/src/kernels/softmax_kernels.cc @@ -0,0 +1,79 @@ +#include "kernels/softmax_kernels.h" +#include "kernels/softmax_kernels_cpu.h" +#include "kernels/softmax_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::Softmax { + +std::optional init_kernel(DeviceType device_type, + device_handle_t const &handle, + ff_dim_t dim, + int input_n, + int input_c, + int input_h, + int input_w) { + if (device_type == DeviceType::GPU) { + return gpu_init_kernel( + /*handle=*/handle.require_for_gpu(), + /*dim=*/dim, + /*input_n=*/input_n, + /*input_c=*/input_c, + /*input_h=*/input_h, + /*input_w=*/input_w); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(handle.is_for_cpu()); + return std::nullopt; + } +} + +void forward_kernel( + device_stream_t const &stream, + std::optional const &per_device_state, + float const *input_ptr, + float *output_ptr) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*per_device_state=*/per_device_state.value(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } else { + ASSERT(stream.is_cpu()); + 
ASSERT(per_device_state == std::nullopt); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr); + } +} + +void backward_kernel(device_stream_t const &stream, + float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*num_elements=*/num_elements); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*output_grad_ptr=*/output_grad_ptr, + /*input_grad_ptr=*/input_grad_ptr, + /*num_elements=*/num_elements); + } +} + +void cleanup_kernel(DeviceType device_type, + std::optional &per_device_state) { + if (device_type == DeviceType::GPU) { + gpu_cleanup_kernel(per_device_state.value()); + } else { + ASSERT(device_type == DeviceType::CPU); + ASSERT(per_device_state == std::nullopt); + } +} + +} // namespace FlexFlow::Kernels::Softmax diff --git a/lib/kernels/src/kernels/softmax_kernels_cpu.cc b/lib/kernels/src/kernels/softmax_kernels_cpu.cc new file mode 100644 index 0000000000..20f9b68299 --- /dev/null +++ b/lib/kernels/src/kernels/softmax_kernels_cpu.cc @@ -0,0 +1,16 @@ +#include "kernels/softmax_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Softmax { + +void cpu_forward_kernel(float const *input_ptr, float *output_ptr) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *output_grad_ptr, + float *input_grad_ptr, + size_t num_elements) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Softmax diff --git a/lib/kernels/src/kernels/split_kernels.cc b/lib/kernels/src/kernels/split_kernels.cc new file mode 100644 index 0000000000..f38ae2e8af --- /dev/null +++ b/lib/kernels/src/kernels/split_kernels.cc @@ -0,0 +1,63 @@ +#include "kernels/split_kernels.h" +#include "kernels/split_kernels_cpu.h" +#include "kernels/split_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::Split { + +void forward_kernel(device_stream_t const &stream, + float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*out_ptrs=*/out_ptrs, + /*in_ptr=*/in_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } else { + cpu_forward_kernel( + /*out_ptrs=*/out_ptrs, + /*in_ptr=*/in_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } +} + +void backward_kernel(device_stream_t const &stream, + float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*in_grad_ptr=*/in_grad_ptr, + /*out_grad_ptr=*/out_grad_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*in_grad_ptr=*/in_grad_ptr, + /*out_grad_ptr=*/out_grad_ptr, + /*out_blk_sizes=*/out_blk_sizes, + /*in_blk_size=*/in_blk_size, + /*num_blks=*/num_blks, + /*numOutputs=*/numOutputs); + } +} + +} // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/src/kernels/split_kernels_cpu.cc b/lib/kernels/src/kernels/split_kernels_cpu.cc new file mode 100644 index 0000000000..1639848ef4 --- /dev/null +++ 
b/lib/kernels/src/kernels/split_kernels_cpu.cc @@ -0,0 +1,24 @@ +#include "kernels/split_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Split { + +void cpu_forward_kernel(float **out_ptrs, + float const *in_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float *in_grad_ptr, + float const **out_grad_ptr, + int const *out_blk_sizes, + int in_blk_size, + int num_blks, + int numOutputs) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Split diff --git a/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc b/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc new file mode 100644 index 0000000000..db7830f926 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_binary_ops.cc @@ -0,0 +1,143 @@ +#include "kernels/tensor_accessor_binary_ops.h" +#include "kernels/map_tensor_accessors.h" +#include "op-attrs/tensor_shape.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_elementwise_add(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l + r; }, + output_allocator); +} + +void tensor_accessor_elementwise_add_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l + r; }, + output); +} + +GenericTensorAccessorW + tensor_accessor_elementwise_subtract(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l - r; }, + output_allocator); +} + +void tensor_accessor_elementwise_subtract_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l - r; }, + output); +} + +GenericTensorAccessorW + tensor_accessor_elementwise_multiply(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + return map_tensor_accessors2( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l * r; }, + output_allocator); +} + +void tensor_accessor_elementwise_multiply_to( + GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + map_tensor_accessors2_to( + lhs, + rhs, + require_same(lhs.shape.data_type, rhs.shape.data_type), + [](auto const &l, auto const &r) { return l * r; }, + output); +} + +static TensorShape get_matmul_output_shape(TensorShape const &lhs, + TensorShape const &rhs) { + ASSERT(get_num_dims(lhs.dims) == 2); + ASSERT(get_num_dims(rhs.dims) == 2); + ASSERT(lhs.data_type == DataType::FLOAT); + ASSERT(rhs.data_type == DataType::FLOAT); + ASSERT(dim_at_idx(lhs.dims, relative_ff_dim_t{1}) == + dim_at_idx(rhs.dims, relative_ff_dim_t{0})); + + return TensorShape{ + TensorDims{FFOrdered{ + dim_at_idx(lhs.dims, relative_ff_dim_t{0}), + dim_at_idx(rhs.dims, relative_ff_dim_t{1}), + }}, + 
DataType::FLOAT, + }; +} + +GenericTensorAccessorW tensor_accessor_matmul(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + Allocator &output_allocator) { + TensorShape output_shape = + get_matmul_output_shape(get_tensor_shape_for_accessor_r(lhs), + get_tensor_shape_for_accessor_r(rhs)); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_matmul_to(lhs, rhs, output); + + return output; +} + +void tensor_accessor_matmul_to(GenericTensorAccessorR const &lhs, + GenericTensorAccessorR const &rhs, + GenericTensorAccessorW const &output) { + TensorShape output_shape = + get_matmul_output_shape(get_tensor_shape_for_accessor_r(lhs), + get_tensor_shape_for_accessor_r(rhs)); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR lhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(lhs, cpu_allocator); + GenericTensorAccessorR rhs_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(rhs, cpu_allocator); + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + for (nonnegative_int i : + nonnegative_range(dim_at_idx(lhs.shape.dims, ff_dim_t{0_n}))) { + for (nonnegative_int j : + nonnegative_range(dim_at_idx(rhs.shape.dims, ff_dim_t{1_n}))) { + float accum = 0.0f; + for (nonnegative_int k : + nonnegative_range(dim_at_idx(lhs.shape.dims, ff_dim_t{1_n}))) { + accum += lhs_cpu.at(TensorDimsCoord{FFOrdered{i, k}}) * + rhs_cpu.at(TensorDimsCoord{FFOrdered{k, j}}); + } + output_cpu.at(TensorDimsCoord{FFOrdered{i, j}}) = accum; + } + } + + return copy_accessor_data_to_l_from_r(output, output_cpu); +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/tensor_accessor_reductions.cc b/lib/kernels/src/kernels/tensor_accessor_reductions.cc index b11791d32c..199de51ff7 100644 --- a/lib/kernels/src/kernels/tensor_accessor_reductions.cc +++ b/lib/kernels/src/kernels/tensor_accessor_reductions.cc @@ -5,7 +5,7 @@ namespace FlexFlow { bool tensor_accessor_all(GenericTensorAccessorR const &t) { - ASSERT(t.data_type == DataType::BOOL); + ASSERT(t.shape.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( t, @@ -16,7 +16,7 @@ bool tensor_accessor_all(GenericTensorAccessorR const &t) { } bool tensor_accessor_any(GenericTensorAccessorR const &t) { - ASSERT(t.data_type == DataType::BOOL); + ASSERT(t.shape.data_type == DataType::BOOL); return reduce_tensor_accessor_in_all_dims( t, diff --git a/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc b/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc new file mode 100644 index 0000000000..0a17e19f80 --- /dev/null +++ b/lib/kernels/src/kernels/tensor_accessor_unary_ops.cc @@ -0,0 +1,247 @@ +#include "kernels/tensor_accessor_unary_ops.h" +#include "kernels/datatype_dispatch.h" +#include "kernels/fill_tensor_accessor.h" +#include "kernels/map_tensor_accessors.h" +#include "op-attrs/datatype_value.h" +#include "op-attrs/ff_ordered/concat.h" +#include "op-attrs/ff_ordered/reversed.h" +#include "op-attrs/ff_ordered/slice.h" +#include "op-attrs/tensor_dims.h" +#include "op-attrs/tensor_dims_coord.h" + +namespace FlexFlow { + +GenericTensorAccessorW + tensor_accessor_scale_by_constant(GenericTensorAccessorR const &t, + float constant, + Allocator &output_allocator) { + ASSERT(t.shape.data_type == DataType::FLOAT); + + return map_tensor_accessor( + t, [&](auto const &elem) { return elem * constant; }, output_allocator); +} + +void tensor_accessor_scale_by_constant_inplace(GenericTensorAccessorW const &t, + 
float constant) { + ASSERT(t.shape.data_type == DataType::FLOAT); + + return map_tensor_accessor_inplace( + t, [&](auto const &elem) { return elem * constant; }); +} + +template +static T single_element_relu(T elem) { + if (elem >= 0) { + return elem; + } else { + return 0; + } +} + +void tensor_accessor_relu_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + map_tensor_accessor_to( + input, [](auto elem) { return single_element_relu(elem); }, output); +} + +GenericTensorAccessorW tensor_accessor_relu(GenericTensorAccessorR const &input, + Allocator &output_allocator) { + return map_tensor_accessor( + input, + [](auto elem) { return single_element_relu(elem); }, + output_allocator); +} + +template +struct CPUTensorAccessorBroadcast { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + + for (TensorDimsCoord const &output_coord : + get_tensor_dims_coord_set(output.shape.dims)) { + TensorDimsCoord input_coord = get_broadcast_src_coord( + /*input_dims=*/input.shape.dims, + /*output_dims=*/output.shape.dims, + /*dst_coord=*/output_coord); + + output.at
<DT>(output_coord) = input.at<DT>
(input_coord); + } + } +}; + +void tensor_accessor_broadcast_to(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + GenericTensorAccessorW const &output) { + ASSERT(tensor_dims_is_broadcastable_to(input.shape.dims, output_dims)); + + TensorShape output_shape = TensorShape{output_dims, input.shape.data_type}; + ASSERT(get_tensor_shape_for_accessor_w(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_broadcast(GenericTensorAccessorR const &input, + TensorDims const &output_dims, + Allocator &output_allocator) { + + TensorShape output_shape = TensorShape{output_dims, input.shape.data_type}; + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_broadcast_to(input, output_dims, output); + + return output; +} + +template +struct CPUTensorAccessorTranspose { + void operator()(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + ASSERT(get_num_dims(input.shape.dims) == 2); + ASSERT(get_num_dims(output.shape.dims) == 2); + + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { + ASSERT(input_coord.ff_ordered.size() == 2); + + TensorDimsCoord output_coord = TensorDimsCoord{ + reversed(input_coord.ff_ordered), + }; + + output.at
<DT>(output_coord) = input.at<DT>
(input_coord); + } + } +}; + +static TensorShape get_transpose_output_shape(TensorShape const &input_shape) { + return TensorShape{ + TensorDims{ + reversed(input_shape.dims.ff_ordered), + }, + input_shape.data_type, + }; +} + +void tensor_accessor_transpose_to(GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + ASSERT(get_num_dims(input.shape.dims) == 2); + + TensorShape output_shape = + get_transpose_output_shape(get_tensor_shape_for_accessor_r(input)); + ASSERT(get_tensor_shape_for_accessor_w(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_transpose(GenericTensorAccessorR const &input, + Allocator &output_allocator) { + + TensorShape output_shape = + get_transpose_output_shape(get_tensor_shape_for_accessor_r(input)); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_transpose_to(input, output); + + return output; +} + +template +struct CPUTensorAccessorReduce { + void operator()(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + GenericTensorAccessorW const &output) { + fill_with_zeros(output); + + for (TensorDimsCoord const &input_coord : + get_tensor_dims_coord_set(input.shape.dims)) { + TensorDimsCoord output_coord = tensor_dims_coord_drop_dims( + input_coord, [&](ff_dim_t input_coord_dim) { + return input_coord_dim == reduction_dim; + }); + + output.at
<DT>(output_coord) += input.at<DT>
(input_coord); + } + } +}; + +static TensorShape get_reduce_output_shape(TensorShape const &input_shape, + ff_dim_t reduction_dim) { + ASSERT(tensor_dims_has_dim(input_shape.dims, reduction_dim), + input_shape.dims, + reduction_dim); + + return TensorShape{ + TensorDims{ + concat( + slice(input_shape.dims.ff_ordered, ff_dim_t{0_n}, reduction_dim), + slice(input_shape.dims.ff_ordered, + ff_dim_t{reduction_dim.value + 1_n}, + std::nullopt)), + }, + input_shape.data_type, + }; +} + +void tensor_accessor_reduce_to(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + GenericTensorAccessorW const &output) { + + TensorShape output_shape = get_reduce_output_shape( + get_tensor_shape_for_accessor_r(input), reduction_dim); + ASSERT(get_tensor_shape_for_accessor_r(output) == output_shape); + + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + GenericTensorAccessorR input_cpu = + copy_tensor_accessor_r_to_cpu_if_necessary(input, cpu_allocator); + + GenericTensorAccessorW output_cpu = + cpu_allocator.allocate_tensor(output_shape); + + DataTypeDispatch1{}( + input.shape.data_type, input_cpu, reduction_dim, output_cpu); + + copy_accessor_data_to_l_from_r(output, output_cpu); +} + +GenericTensorAccessorW + tensor_accessor_reduce(GenericTensorAccessorR const &input, + ff_dim_t reduction_dim, + Allocator &output_allocator) { + + TensorShape output_shape = get_reduce_output_shape( + get_tensor_shape_for_accessor_r(input), reduction_dim); + + GenericTensorAccessorW output = + output_allocator.allocate_tensor(output_shape); + + tensor_accessor_reduce_to(input, reduction_dim, output); + + return output; +} + +} // namespace FlexFlow diff --git a/lib/kernels/src/kernels/topk_kernels.cc b/lib/kernels/src/kernels/topk_kernels.cc new file mode 100644 index 0000000000..a3a3c616b3 --- /dev/null +++ b/lib/kernels/src/kernels/topk_kernels.cc @@ -0,0 +1,67 @@ +#include "kernels/topk_kernels.h" +#include "kernels/topk_kernels_cpu.h" +#include "kernels/topk_kernels_gpu.h" +#include + +namespace FlexFlow::Kernels::TopK { + +void forward_kernel(device_stream_t const &stream, + float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*indices_ptr=*/indices_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k, + /*sorted=*/sorted); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*input_ptr=*/input_ptr, + /*output_ptr=*/output_ptr, + /*indices_ptr=*/indices_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k, + /*sorted=*/sorted); + } +} + +void backward_kernel(device_stream_t const &stream, + float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*out_grad_ptr=*/out_grad_ptr, + /*indices_ptr=*/indices_ptr, + /*in_grad_ptr=*/in_grad_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*out_grad_ptr=*/out_grad_ptr, + /*indices_ptr=*/indices_ptr, + /*in_grad_ptr=*/in_grad_ptr, + /*batch_size=*/batch_size, + /*length=*/length, + /*k=*/k); + } +} + +} // namespace FlexFlow::Kernels::TopK diff --git a/lib/kernels/src/kernels/topk_kernels_cpu.cc b/lib/kernels/src/kernels/topk_kernels_cpu.cc new file mode 100644 index 
0000000000..86ab45f773 --- /dev/null +++ b/lib/kernels/src/kernels/topk_kernels_cpu.cc @@ -0,0 +1,25 @@ +#include "kernels/topk_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::TopK { + +void cpu_forward_kernel(float const *input_ptr, + float *output_ptr, + int *indices_ptr, + size_t batch_size, + int length, + int k, + bool sorted) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(float const *out_grad_ptr, + int const *indices_ptr, + float *in_grad_ptr, + size_t batch_size, + int length, + int k) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::TopK diff --git a/lib/kernels/src/kernels/transpose_kernels.cc b/lib/kernels/src/kernels/transpose_kernels.cc new file mode 100644 index 0000000000..bb3775a073 --- /dev/null +++ b/lib/kernels/src/kernels/transpose_kernels.cc @@ -0,0 +1,45 @@ +#include "kernels/transpose_kernels.h" +#include "kernels/transpose_kernels_cpu.h" +#include "kernels/transpose_kernels_gpu.h" + +namespace FlexFlow::Kernels::Transpose { + +void forward_kernel(device_stream_t const &stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + if (stream.is_gpu()) { + gpu_forward_kernel( + /*stream=*/stream.require_gpu(), + /*attrs=*/attrs, + /*input=*/input, + /*output=*/output); + } else { + ASSERT(stream.is_cpu()); + cpu_forward_kernel( + /*attrs=*/attrs, + /*input=*/input, + /*output=*/output); + } +} + +void backward_kernel(device_stream_t const &stream, + TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { + if (stream.is_gpu()) { + gpu_backward_kernel( + /*stream=*/stream.require_gpu(), + /*attrs=*/attrs, + /*out_grad=*/out_grad, + /*in_grad=*/in_grad); + } else { + ASSERT(stream.is_cpu()); + cpu_backward_kernel( + /*attrs=*/attrs, + /*out_grad=*/out_grad, + /*in_grad=*/in_grad); + } +} + +} // namespace FlexFlow::Kernels::Transpose diff --git a/lib/kernels/src/kernels/transpose_kernels_cpu.cc b/lib/kernels/src/kernels/transpose_kernels_cpu.cc new file mode 100644 index 0000000000..7950e71eb4 --- /dev/null +++ b/lib/kernels/src/kernels/transpose_kernels_cpu.cc @@ -0,0 +1,18 @@ +#include "kernels/transpose_kernels_cpu.h" +#include "utils/exception.h" + +namespace FlexFlow::Kernels::Transpose { + +void cpu_forward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + NOT_IMPLEMENTED(); +} + +void cpu_backward_kernel(TransposeAttrs const &attrs, + GenericTensorAccessorR const &out_grad, + GenericTensorAccessorW const &in_grad) { + NOT_IMPLEMENTED(); +} + +} // namespace FlexFlow::Kernels::Transpose diff --git a/lib/kernels/src/managed_per_device_ff_handle.cc b/lib/kernels/src/managed_per_device_ff_handle.cc index 305a6c935c..cccc46d6bf 100644 --- a/lib/kernels/src/managed_per_device_ff_handle.cc +++ b/lib/kernels/src/managed_per_device_ff_handle.cc @@ -51,6 +51,19 @@ PerDeviceFFHandle const &ManagedPerDeviceFFHandle::raw_handle() const { return *handle; } +std::optional<ManagedPerDeviceFFHandle> + create_local_handle_for_device_type(DeviceType device_type, + size_t workSpaceSize, + bool allowTensorOpMathConversion) { + if (device_type == DeviceType::CPU) { + return std::nullopt; + } else { + return initialize_single_gpu_handle( + /*workSpaceSize=*/workSpaceSize, + /*allowTensorOpMathConversion=*/allowTensorOpMathConversion); + } +} + ManagedPerDeviceFFHandle initialize_single_gpu_handle(size_t workSpaceSize, bool allowTensorOpMathConversion) { diff --git 
a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc b/lib/kernels/test/src/cpu/ops/replicate_kernels.cc deleted file mode 100644 index 1984fd5f83..0000000000 --- a/lib/kernels/test/src/cpu/ops/replicate_kernels.cc +++ /dev/null @@ -1,59 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/create_accessor_with_contents.h" -#include "kernels/format_accessor_contents.h" -#include "kernels/replicate_kernels_cpu.h" -#include "test/utils/doctest/check_kv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("Replicate::cpu_forward_kernel") { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, cpu_allocator); - - TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::INT32, - }; - GenericTensorAccessorW result = - create_zero_filled_accessor_w(result_shape, cpu_allocator); - - GenericTensorAccessorR correct = input; - - Kernels::Replicate::cpu_forward_kernel(input, result); - - CHECK_MESSAGE(accessors_are_equal(result, correct), - "result=", - format_accessor_w_contents(result)); - } - - TEST_CASE("Replicate::cpu_backward_kernel") { - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - GenericTensorAccessorR output = create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - cpu_allocator); - - GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); - - TensorShape result_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::INT32, - }; - GenericTensorAccessorW result = - create_zero_filled_accessor_w(result_shape, cpu_allocator); - Kernels::Replicate::cpu_backward_kernel(output, result, 3); - - CHECK_MESSAGE(accessors_are_equal(result, correct), - check_kv("result", format_accessor_w_contents(result))); - } -} diff --git a/lib/kernels/src/test_utils.cc b/lib/kernels/test/src/internal/test_utils.cc similarity index 85% rename from lib/kernels/src/test_utils.cc rename to lib/kernels/test/src/internal/test_utils.cc index 67f2fb624a..b30c656f9a 100644 --- a/lib/kernels/src/test_utils.cc +++ b/lib/kernels/test/src/internal/test_utils.cc @@ -1,4 +1,5 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" +#include "kernels/fill_tensor_accessor.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/require_all_same1.h" #include "utils/join_strings.h" @@ -32,7 +33,7 @@ struct CreateRandomFilledAccessorW { std::random_device rd; std::mt19937 gen(rd()); - size_t num_elements = get_num_elements(shape).int_from_positive_int(); + size_t num_elements = get_num_elements(shape.dims).int_from_positive_int(); if constexpr (std::is_same::value) { std::bernoulli_distribution dist(0.5); for (size_t i = 0; i < num_elements; i++) { @@ -71,28 +72,6 @@ GenericTensorAccessorR create_random_filled_accessor_r(TensorShape const &shape, return read_only_accessor_from_write_accessor(accessor); } -template -struct FillWithZeros { - void operator()(GenericTensorAccessorW const &accessor) { - using T = real_type_t
; - - if (accessor.device_type == DeviceType::CPU) { - memset(accessor.ptr, - 0, - accessor.shape.num_elements().int_from_positive_int() * sizeof(T)); - } else { - checkCUDA(cudaMemset( - accessor.ptr, - 0, - accessor.shape.num_elements().int_from_positive_int() * sizeof(T))); - } - } -}; - -void fill_with_zeros(GenericTensorAccessorW const &accessor) { - DataTypeDispatch1{}(accessor.data_type, accessor); -} - template struct CPUAccessorRContainsNonZero { bool operator()(GenericTensorAccessorR const &accessor) { @@ -100,7 +79,7 @@ struct CPUAccessorRContainsNonZero { T const *data_ptr = accessor.get
<DT>(); - int volume = accessor.shape.num_elements().int_from_positive_int(); + int volume = get_num_elements(accessor.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (data_ptr[i] != 0) { return true; @@ -116,7 +95,7 @@ bool contains_non_zero(GenericTensorAccessorR const &accessor) { GenericTensorAccessorR cpu_accessor = copy_tensor_accessor_r_to_cpu_if_necessary(accessor, cpu_allocator); return DataTypeDispatch1<CPUAccessorRContainsNonZero>{}( - cpu_accessor.data_type, cpu_accessor); + cpu_accessor.shape.data_type, cpu_accessor); } template <DataType DT> struct AccessorsAreEqual { bool operator()(GenericTensorAccessorR const &accessor_a, GenericTensorAccessorR const &accessor_b, Allocator &cpu_allocator) { GenericTensorAccessorR cpu_accessor_a = copy_tensor_accessor_r_to_cpu_if_necessary(accessor_a, cpu_allocator); GenericTensorAccessorR cpu_accessor_b = copy_tensor_accessor_r_to_cpu_if_necessary(accessor_b, cpu_allocator); T const *a_data_ptr = cpu_accessor_a.get<DT>
(); T const *b_data_ptr = cpu_accessor_b.get<DT>
(); - int volume = accessor_a.shape.num_elements().int_from_positive_int(); + int volume = + get_num_elements(accessor_a.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { if (a_data_ptr[i] != b_data_ptr[i]) { return false; @@ -150,7 +130,7 @@ bool accessors_are_equal(GenericTensorAccessorR const &accessor_a, "accessors_are_equal expects accessors to have the same shape"); return DataTypeDispatch1{}( - accessor_a.data_type, accessor_a, accessor_b); + accessor_a.shape.data_type, accessor_a, accessor_b); } template @@ -171,7 +151,8 @@ struct CreateFilledAccessorW { T *data_ptr = src_accessor.get
(); - int volume = dst_accessor.shape.num_elements().int_from_positive_int(); + int volume = + get_num_elements(dst_accessor.shape.dims).int_from_positive_int(); for (size_t i = 0; i < volume; i++) { data_ptr[i] = unwrapped_value; } diff --git a/lib/kernels/include/kernels/test_utils.h b/lib/kernels/test/src/internal/test_utils.h similarity index 96% rename from lib/kernels/include/kernels/test_utils.h rename to lib/kernels/test/src/internal/test_utils.h index 9147b667d6..3a2c9b773c 100644 --- a/lib/kernels/include/kernels/test_utils.h +++ b/lib/kernels/test/src/internal/test_utils.h @@ -31,8 +31,6 @@ GenericTensorAccessorR create_zero_filled_accessor_r(TensorShape const &shape, bool contains_non_zero(GenericTensorAccessorR const &accessor); -void fill_with_zeros(GenericTensorAccessorW const &accessor); - void print_2d_tensor_accessor_contents(GenericTensorAccessorR const &accessor, std::ostream &stream); diff --git a/lib/kernels/test/src/kernels/accessor.cc b/lib/kernels/test/src/kernels/accessor.cc index 31a6cba205..b5daf80011 100644 --- a/lib/kernels/test/src/kernels/accessor.cc +++ b/lib/kernels/test/src/kernels/accessor.cc @@ -1,5 +1,5 @@ #include "kernels/accessor.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include @@ -9,9 +9,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("calculate_accessor_offset") { SUBCASE("one dimension") { - std::vector indices = {4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 13_p, }, }; @@ -22,24 +22,43 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } + SUBCASE("2d tensor is row-major") { + positive_int num_rows = 5_p; + positive_int num_cols = 6_p; + + TensorDims shape = TensorDims{ + FFOrdered{ + num_rows, + num_cols, + }, + }; + + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{0_n, 0_n}}, + shape) == 0_n); + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{1_n, 0_n}}, + shape) == num_cols); + CHECK(calculate_accessor_offset(TensorDimsCoord{FFOrdered{0_n, 1_n}}, + shape) == 1_p); + } + SUBCASE("multiple dimensions") { - std::vector indices = {2_n, 4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{2_n, 4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; nonnegative_int result = calculate_accessor_offset(indices, shape); - nonnegative_int correct = 2_n * 5_n + 4_n; + nonnegative_int correct = 2_n * 6_n + 4_n; CHECK(result == correct); } SUBCASE("zero dimensions") { - std::vector indices = {}; - ArrayShape shape = ArrayShape{std::vector{}}; + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{}}; + TensorDims shape = TensorDims{FFOrdered{}}; nonnegative_int result = calculate_accessor_offset(indices, shape); nonnegative_int correct = 0_n; @@ -48,11 +67,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("index and shape dimensions do not match") { - std::vector indices = {1_n, 2_n, 4_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{1_n, 2_n, 4_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; @@ -60,11 +79,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("out of bounds index") { - std::vector indices = {2_n, 5_n}; - ArrayShape shape = ArrayShape{ - std::vector{ - 6_p, + TensorDimsCoord indices = TensorDimsCoord{FFOrdered{2_n, 
6_n}}; + TensorDims shape = TensorDims{ + FFOrdered{ 5_p, + 6_p, }, }; diff --git a/lib/kernels/test/src/kernels/array_shape.cc b/lib/kernels/test/src/kernels/array_shape.cc deleted file mode 100644 index b3ccbc688c..0000000000 --- a/lib/kernels/test/src/kernels/array_shape.cc +++ /dev/null @@ -1,87 +0,0 @@ -#include "kernels/array_shape.h" -#include "test/utils/doctest/fmt/unordered_set.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("get_array_coord_set") { - SUBCASE("ArrayShape is not empty") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 1_p, 3_p}, - }; - - std::unordered_set result = get_array_coord_set(input); - std::unordered_set correct = { - ArrayCoord{FFOrdered{0_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{0_n, 0_n, 1_n}}, - ArrayCoord{FFOrdered{1_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{1_n, 0_n, 1_n}}, - ArrayCoord{FFOrdered{2_n, 0_n, 0_n}}, - ArrayCoord{FFOrdered{2_n, 0_n, 1_n}}, - }; - - CHECK(result == correct); - } - - SUBCASE("ArrayShape is zero-dimensional") { - ArrayShape input = ArrayShape{LegionOrdered{}}; - - std::unordered_set result = get_array_coord_set(input); - std::unordered_set correct = { - ArrayCoord{FFOrdered{}}, - }; - - CHECK(result == correct); - } - } - - TEST_CASE("array_shape_drop_dims") { - ArrayShape input = ArrayShape{ - LegionOrdered{2_p, 4_p, 3_p}, - }; - - SUBCASE("removes dims specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { - return dim.value % 2_n == 0; - }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = ArrayShape{ - LegionOrdered{4_p}, - }; - - CHECK(result == correct); - } - - SUBCASE( - "is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = input; - - CHECK(result == correct); - } - - SUBCASE( - "is identity function if no dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return false; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = input; - - CHECK(result == correct); - } - - SUBCASE( - "returns empty shape if all dimensions are specified to be dropped") { - auto should_drop_dim = [](ff_dim_t dim) -> bool { return true; }; - - ArrayShape result = array_shape_drop_dims(input, should_drop_dim); - ArrayShape correct = ArrayShape{LegionOrdered{}}; - - CHECK(result == correct); - } - } -} diff --git a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc index 4e85dfdaa0..85ffa91315 100644 --- a/lib/kernels/test/src/kernels/compare_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/compare_tensor_accessors.cc @@ -1,5 +1,5 @@ #include "kernels/compare_tensor_accessors.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc index 69fa2728bf..3f073f2697 100644 --- a/lib/kernels/test/src/kernels/create_accessor_with_contents.cc +++ b/lib/kernels/test/src/kernels/create_accessor_with_contents.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_1d_accessor_w_with_contents({1, 4, 1, 2}, cpu_allocator); auto at = 
[&](nonnegative_int c) -> float { - return result.at(FFOrdered{c}); + return result.at(TensorDimsCoord{FFOrdered{c}}); }; CHECK(at(0_n) == 1); @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 1); @@ -62,7 +62,7 @@ TEST_SUITE(FF_TEST_SUITE) { auto at = [&](nonnegative_int s, nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{s, r, c}); + return result.at(TensorDimsCoord{FFOrdered{s, r, c}}); }; CHECK(at(0_n, 0_n, 0_n) == 1); @@ -111,7 +111,8 @@ TEST_SUITE(FF_TEST_SUITE) { nonnegative_int s2, nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{s1, s2, r, c}); + return result.at( + TensorDimsCoord{FFOrdered{s1, s2, r, c}}); }; CHECK(at(0_n, 0_n, 0_n, 0_n) == 2); diff --git a/lib/kernels/test/src/kernels/format_accessor_contents.cc b/lib/kernels/test/src/kernels/format_accessor_contents.cc index a2b61b8dff..b4af5c9148 100644 --- a/lib/kernels/test/src/kernels/format_accessor_contents.cc +++ b/lib/kernels/test/src/kernels/format_accessor_contents.cc @@ -1,5 +1,5 @@ #include "kernels/format_accessor_contents.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/local_cpu_allocator.h" #include @@ -88,11 +88,67 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - SUBCASE("accessor is some other dimension") { + SUBCASE("accessor is 4d") { GenericTensorAccessorR accessor = - create_4d_accessor_r_with_contents({{{{5}}}}, cpu_allocator); + create_4d_accessor_r_with_contents( + { + { + { + {2, 1, 1, 9}, + {1, 3, 6, 2}, + {1, 9, 8, 9}, + }, + { + {9, 2, 7, 6}, + {7, 2, 1, 1}, + {2, 8, 5, 6}, + }, + }, + { + { + {1, 2, 3, 6}, + {4, 3, 3, 9}, + {1, 1, 5, 1}, + }, + { + {4, 1, 8, 7}, + {9, 4, 2, 4}, + {1, 0, 0, 6}, + }, + }, + }, + cpu_allocator); - CHECK_THROWS(format_accessor_r_contents(accessor)); + std::string correct = "[\n" + " [\n" + " [\n" + " [2 1 1 9]\n" + " [1 3 6 2]\n" + " [1 9 8 9]\n" + " ]\n" + " [\n" + " [9 2 7 6]\n" + " [7 2 1 1]\n" + " [2 8 5 6]\n" + " ]\n" + " ]\n" + " [\n" + " [\n" + " [1 2 3 6]\n" + " [4 3 3 9]\n" + " [1 1 5 1]\n" + " ]\n" + " [\n" + " [4 1 8 7]\n" + " [9 4 2 4]\n" + " [1 0 0 6]\n" + " ]\n" + " ]\n" + "]"; + + std::string result = format_accessor_r_contents(accessor); + + CHECK(result == correct); } } } diff --git a/lib/kernels/test/src/kernels/legion_dim.cc b/lib/kernels/test/src/kernels/legion_dim.cc index 34822ed1c3..23401ffebe 100644 --- a/lib/kernels/test/src/kernels/legion_dim.cc +++ b/lib/kernels/test/src/kernels/legion_dim.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("key_range(LegionOrdered)") { SUBCASE("input is non-empty") { - LegionOrdered input = {5, 3, 2, 3}; + LegionOrdered input = LegionOrdered{5, 3, 2, 3}; std::set result = key_range(input); std::set correct = { diff --git a/lib/kernels/test/src/kernels/legion_ordered/transform.cc b/lib/kernels/test/src/kernels/legion_ordered/transform.cc index 759507264f..e1846bd01a 100644 --- a/lib/kernels/test/src/kernels/legion_ordered/transform.cc +++ b/lib/kernels/test/src/kernels/legion_ordered/transform.cc @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } SUBCASE("input is not empty") { - LegionOrdered input = {2, 1, 2, 5}; + LegionOrdered input = LegionOrdered{2, 1, 2, 5}; LegionOrdered result = transform(input, [](int x) { return 
fmt::to_string(x); }); diff --git a/lib/kernels/test/src/kernels/linear_kernels.cc b/lib/kernels/test/src/kernels/linear_kernels.cc new file mode 100644 index 0000000000..423e6be4f1 --- /dev/null +++ b/lib/kernels/test/src/kernels/linear_kernels.cc @@ -0,0 +1,263 @@ +#include "kernels/linear_kernels.h" +#include "internal/test_utils.h" +#include "kernels/copy_tensor_accessor.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/create_local_allocator_for_device_type.h" +#include "kernels/device_handle_t.h" +#include "kernels/device_stream_t.h" +#include "kernels/format_accessor_contents.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("linear_forward_kernel cpu-gpu alignment") { + Allocator local_cpu_allocator = create_local_cpu_memory_allocator(); + + // GenericTensorAccessorR toy_input = + // create_2d_accessor_r_with_contents( + // { + // {3, 3, 6}, + // {2, 1, 5}, + // {1, 2, -2}, + // {8, 0.5, -3}, + // }, + // local_cpu_allocator); + // float const *toy_arr = toy_input.get_float_ptr(); + // std::cout << toy_arr[0] << " " + // << toy_arr[1] << " " + // << toy_arr[2] << std::endl; + // + // Allocator local_cuda_allocator = create_local_cuda_memory_allocator(); + // GenericTensorAccessorW toy_cuda = + // local_cuda_allocator.allocate_tensor(toy_input.shape); + // copy_accessor_data_to_l_from_r(toy_cuda, toy_input); + // GenericTensorAccessorW toy_input2 = + // local_cpu_allocator.allocate_tensor(toy_input.shape); + // copy_accessor_data_to_l_from_r(toy_input2, + // read_only_accessor_from_write_accessor(toy_cuda)); CHECK_MESSAGE( + // accessors_are_equal(toy_input, toy_input2), + // check_kv("cpu_result", format_accessor_r_contents(toy_input)), + // check_kv("gpu_result", format_accessor_w_contents(toy_input2))); + + auto run_forward_kernel = [&](DeviceType device_type) { + Allocator allocator = create_local_allocator_for_device_type(device_type); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, allocator); + + int batch_size = 4; + positive_int output_num_channels = 2_p; + + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{positive_int{batch_size}, output_num_channels}}, + DataType::FLOAT, + }; + + GenericTensorAccessorW output = allocator.allocate_tensor(output_shape); + + std::optional managed_handle = + create_local_handle_for_device_type( + device_type, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/output_num_channels, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + std::optional per_device_state = linear_init_kernel( + /*device_type=*/device_type, + /*handle=*/device_handle_t_from_managed_handle(managed_handle), + /*activation=*/attrs.activation, + /*regularizer=*/attrs.regularizer, + /*use_bias=*/attrs.use_bias, + /*input_type=*/DataType::FLOAT, + /*weight_type=*/DataType::FLOAT, + /*output_type=*/DataType::FLOAT, + /*batch_size=*/batch_size, + /*output_num_channels=*/attrs.out_channels.int_from_positive_int()); + + 
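+      // Minimal sanity-check sketch, assuming linear_init_kernel follows the
+      // same convention as the other init_kernel overloads in this patch:
+      // per-device state is only materialized for GPU runs, so a CPU run
+      // should observe std::nullopt here. The exact ASSERT below is an
+      // assumed, illustrative form rather than a line from the original change.
+      ASSERT((device_type == DeviceType::GPU) == per_device_state.has_value());
+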
device_stream_t stream = get_stream_for_device_type(device_type); + + linear_forward_kernel( + /*stream=*/stream, + /*per_device_state=*/per_device_state, + /*attrs=*/attrs, + /*input_accessor=*/input, + /*output_accessor=*/output, + /*projection_accessor=*/projection, + /*bias_accessor=*/bias); + + return copy_tensor_accessor_w(output, local_cpu_allocator); + }; + + GenericTensorAccessorW cpu_result = run_forward_kernel(DeviceType::CPU); + GenericTensorAccessorW gpu_result = run_forward_kernel(DeviceType::GPU); + + CHECK_MESSAGE( + accessors_are_equal(cpu_result, gpu_result), + check_kv("cpu_result", format_accessor_w_contents(cpu_result)), + check_kv("gpu_result", format_accessor_w_contents(gpu_result))); + } + + TEST_CASE("backward_kernel CPU/GPU alignment (Linear)") { + Allocator local_cpu_allocator = create_local_cpu_memory_allocator(); + + auto run_forward_kernel = [&](DeviceType device_type) { + Allocator allocator = create_local_allocator_for_device_type(device_type); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + allocator); + + GenericTensorAccessorW input_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(input), allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + allocator); + + GenericTensorAccessorW projection_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(projection), allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, allocator); + + GenericTensorAccessorW bias_grad = create_zero_filled_accessor_w( + get_tensor_shape_for_accessor_r(bias), allocator); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + allocator); + + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1.0f, -0.5f}, + {2.0f, -2.0f}, + {1.0f, 9.0f}, + {-3.5f, 1.0f}, + }, + allocator); + + int batch_size = 4; + positive_int output_num_channels = 2_p; + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/output_num_channels, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{positive_int{batch_size}, + positive_int{output_num_channels}}}, + DataType::FLOAT, + }; + + std::optional managed_handle = + create_local_handle_for_device_type( + device_type, + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + std::optional per_device_state = linear_init_kernel( + /*device_type=*/device_type, + /*handle=*/device_handle_t_from_managed_handle(managed_handle), + /*activation=*/attrs.activation, + /*regularizer=*/attrs.regularizer, + /*use_bias=*/true, + /*input_type=*/DataType::FLOAT, + /*weight_type=*/DataType::FLOAT, + /*output_type=*/DataType::FLOAT, + /*batch_size=*/batch_size, + /*output_num_channels=*/attrs.out_channels.int_from_positive_int()); + + device_stream_t stream = get_stream_for_device_type(device_type); + + linear_backward_kernel( + /*stream=*/stream, + /*per_device_state=*/per_device_state, + /*attrs=*/attrs, + /*output=*/output, + /*output_grad=*/output_grad, + /*input=*/input, + /*input_grad=*/input_grad, + /*projection=*/projection, + /*projection_grad=*/projection_grad, + /*bias_grad=*/bias_grad); + + return std::tuple{ + 
copy_tensor_accessor_w(input_grad, local_cpu_allocator), + copy_tensor_accessor_w(projection_grad, local_cpu_allocator), + copy_tensor_accessor_w(bias_grad, local_cpu_allocator), + }; + }; + + auto cpu_results = run_forward_kernel(DeviceType::CPU); + GenericTensorAccessorW cpu_input_grad = std::get<0>(cpu_results); + GenericTensorAccessorW cpu_projection_grad = std::get<1>(cpu_results); + GenericTensorAccessorW cpu_bias_grad = std::get<2>(cpu_results); + + auto gpu_results = run_forward_kernel(DeviceType::GPU); + GenericTensorAccessorW gpu_input_grad = std::get<0>(gpu_results); + GenericTensorAccessorW gpu_projection_grad = std::get<1>(gpu_results); + GenericTensorAccessorW gpu_bias_grad = std::get<2>(gpu_results); + + CHECK_MESSAGE( + accessors_are_equal(cpu_input_grad, gpu_input_grad), + check_kv("cpu_input_grad", format_accessor_w_contents(cpu_input_grad)), + check_kv("gpu_input_grad", format_accessor_w_contents(gpu_input_grad))); + + CHECK_MESSAGE(accessors_are_equal(cpu_projection_grad, gpu_projection_grad), + check_kv("cpu_projection_grad", + format_accessor_w_contents(cpu_projection_grad)), + check_kv("gpu_projection_grad", + format_accessor_w_contents(gpu_projection_grad))); + + CHECK_MESSAGE( + accessors_are_equal(cpu_bias_grad, gpu_bias_grad), + check_kv("cpu_bias_grad", format_accessor_w_contents(cpu_bias_grad)), + check_kv("gpu_bias_grad", format_accessor_w_contents(gpu_bias_grad))); + } +} diff --git a/lib/kernels/test/src/kernels/linear_kernels_cpu.cc b/lib/kernels/test/src/kernels/linear_kernels_cpu.cc new file mode 100644 index 0000000000..0586fd7d1f --- /dev/null +++ b/lib/kernels/test/src/kernels/linear_kernels_cpu.cc @@ -0,0 +1,175 @@ +#include "kernels/linear_kernels_cpu.h" +#include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("linear_cpu_forward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/2_p, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + }; + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + cpu_allocator); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, cpu_allocator); + + GenericTensorAccessorW result = create_zero_filled_accessor_w( + TensorShape{ + TensorDims{FFOrdered{4_p, attrs.out_channels}}, + DataType::FLOAT, + }, + cpu_allocator); + + linear_cpu_forward_kernel( + /*attrs=*/attrs, + /*input=*/input, + /*output=*/result, + /*projection=*/projection, + /*bias=*/bias); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("linear_cpu_backward_kernel") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/2_p, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, + 
}; + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {2, 1, 5}, + {1, 2, -2}, + {8, 0.5, -3}, + }, + cpu_allocator); + + GenericTensorAccessorW input_grad = + cpu_allocator.allocate_tensor(get_tensor_shape_for_accessor_r(input)); + + GenericTensorAccessorR projection = + create_2d_accessor_r_with_contents( + { + {1.0f, 2.0f, 1.5f}, + {0.5f, 4.0f, -1.0f}, + }, + cpu_allocator); + + GenericTensorAccessorW projection_grad = cpu_allocator.allocate_tensor( + get_tensor_shape_for_accessor_r(projection)); + + GenericTensorAccessorR bias = + create_1d_accessor_r_with_contents({3.0, -1.0}, cpu_allocator); + + GenericTensorAccessorW bias_grad = + cpu_allocator.allocate_tensor(get_tensor_shape_for_accessor_r(bias)); + + GenericTensorAccessorR output = create_2d_accessor_r_with_contents( + { + {21.0f, 6.5f}, + {14.5f, -1.0f}, + {5.0f, 9.5f}, + {7.5f, 8.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR output_grad = + create_2d_accessor_r_with_contents( + { + {1.0f, -0.5f}, + {2.0f, -2.0f}, + {1.0f, 9.0f}, + {-3.5f, 1.0f}, + }, + cpu_allocator); + + linear_cpu_backward_kernel( + /*attrs=*/attrs, + /*output=*/output, + /*output_grad=*/output_grad, + /*input=*/input, + /*input_grad=*/input_grad, + /*projection=*/projection, + /*projection_grad=*/projection_grad, + /*bias_grad=*/bias_grad); + + GenericTensorAccessorR correct_input_grad = + create_2d_accessor_r_with_contents( + { + {0.75f, 0.0f, 2.0f}, + {1.0f, -4.0f, 5.0f}, + {5.5f, 38.0f, -7.5f}, + {-3.0f, -3.0f, -6.25f}, + }, + cpu_allocator); + + GenericTensorAccessorR correct_projection_grad = + create_2d_accessor_r_with_contents( + { + {-20.0f, 5.25f, 24.5f}, + {11.5f, 15.0f, -34.0f}, + }, + cpu_allocator); + + GenericTensorAccessorR correct_bias_grad = + create_1d_accessor_r_with_contents( + { + 1.0f + 2.0f + 1.0f + -3.5f, + -0.5f + -2.0f + 9.0f + 1.0f, + }, + cpu_allocator); + + CHECK_MESSAGE( + accessors_are_equal(input_grad, correct_input_grad), + check_kv("input_grad", format_accessor_w_contents(input_grad))); + + CHECK_MESSAGE(accessors_are_equal(projection_grad, correct_projection_grad), + check_kv("projection_grad", + format_accessor_w_contents(projection_grad))); + + CHECK_MESSAGE(accessors_are_equal(bias_grad, correct_bias_grad), + check_kv("bias_grad", format_accessor_w_contents(bias_grad))); + } +} diff --git a/lib/kernels/test/src/kernels/map_tensor_accessors.cc b/lib/kernels/test/src/kernels/map_tensor_accessors.cc index 60d7c76904..9b61786fc0 100644 --- a/lib/kernels/test/src/kernels/map_tensor_accessors.cc +++ b/lib/kernels/test/src/kernels/map_tensor_accessors.cc @@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { map_tensor_accessor_inplace(accessor, [](float x) { return x + 1; }); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return accessor.at(FFOrdered{r, c}); + return accessor.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 2); @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { input, [](float x) { return x + 1; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 2); @@ -60,7 +60,7 @@ TEST_SUITE(FF_TEST_SUITE) { input, [](float x) -> bool { return x > 2; }, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == false); @@ -99,7 +99,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); 
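// Illustrative sketch of the coordinate convention this hunk migrates to,
// assuming at() is templated on DataType as elsewhere in this codebase: a
// TensorDimsCoord wraps an FFOrdered coordinate, and for a row-major
// accessor with dims {R, C} the coord {r, c} resolves to flat offset
// r * C + c, e.g.
//   float x = result.at<DataType::FLOAT>(TensorDimsCoord{FFOrdered{1_n, 2_n}});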
auto at = [&](nonnegative_int r, nonnegative_int c) -> float { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == 1); @@ -119,7 +119,7 @@ TEST_SUITE(FF_TEST_SUITE) { cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> bool { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == true); @@ -150,7 +150,7 @@ TEST_SUITE(FF_TEST_SUITE) { lhs, rhs, DataType::DOUBLE, func, cpu_allocator); auto at = [&](nonnegative_int r, nonnegative_int c) -> double { - return result.at(FFOrdered{r, c}); + return result.at(TensorDimsCoord{FFOrdered{r, c}}); }; CHECK(at(0_n, 0_n) == -1); diff --git a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc index dd5f8e06f6..a269cf4777 100644 --- a/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc +++ b/lib/kernels/test/src/kernels/reduce_tensor_accessor.cc @@ -1,5 +1,5 @@ #include "kernels/reduce_tensor_accessor.h" -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" #include "test/utils/doctest/check_kv.h" diff --git a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc b/lib/kernels/test/src/kernels/reverse_kernels_cpu.cc similarity index 99% rename from lib/kernels/test/src/cpu/ops/reverse_kernels.cc rename to lib/kernels/test/src/kernels/reverse_kernels_cpu.cc index 5e27b9d350..98ab88bd3f 100644 --- a/lib/kernels/test/src/cpu/ops/reverse_kernels.cc +++ b/lib/kernels/test/src/kernels/reverse_kernels_cpu.cc @@ -1,7 +1,7 @@ -#include "kernels/test_utils.h" +#include "kernels/reverse_kernels_cpu.h" +#include "internal/test_utils.h" #include "kernels/create_accessor_with_contents.h" #include "kernels/format_accessor_contents.h" -#include "kernels/reverse_kernels_cpu.h" #include "test/utils/doctest/check_kv.h" #include diff --git a/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc b/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc new file mode 100644 index 0000000000..e4660f4fc4 --- /dev/null +++ b/lib/kernels/test/src/kernels/tensor_accessor_unary_ops.cc @@ -0,0 +1,178 @@ +#include "kernels/tensor_accessor_unary_ops.h" +#include "internal/test_utils.h" +#include "kernels/create_accessor_with_contents.h" +#include "kernels/format_accessor_contents.h" +#include "test/utils/doctest/check_kv.h" +#include "utils/containers/repeat_element.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("tensor_accessor_scale_by_constant") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = + tensor_accessor_scale_by_constant(input, -2.0, cpu_allocator); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {-6, -6, -12}, + {0, 2, -1.5}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_relu") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, -3, -6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = tensor_accessor_relu(input, cpu_allocator); + + GenericTensorAccessorR correct = 
create_2d_accessor_r_with_contents( + { + {3, 0, 0}, + {0, 0, 0.75}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_scale_by_constant_inplace") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorW input = create_2d_accessor_w_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + tensor_accessor_scale_by_constant_inplace(input, -2.0); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {-6, -6, -12}, + {0, 2, -1.5}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(input, correct), + check_kv("result", format_accessor_w_contents(input))); + } + + TEST_CASE("tensor_accessor_broadcast") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3}, + {-0.5}, + {6}, + }, + cpu_allocator); + + TensorDims output_dims = TensorDims{ + FFOrdered{4_p, 1_p, 3_p, 2_p}, + }; + + GenericTensorAccessorW result = + tensor_accessor_broadcast(input, output_dims, cpu_allocator); + + GenericTensorAccessorR correct = create_4d_accessor_r_with_contents( + repeat_element(4_n, + std::vector<std::vector<std::vector<double>>>{ + std::vector<std::vector<double>>{ + repeat_element(2_n, 3.0), + repeat_element(2_n, -0.5), + repeat_element(2_n, 6.0), + }}), + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("input", format_accessor_r_contents(input)), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_transpose") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + GenericTensorAccessorW result = + tensor_accessor_transpose(input, cpu_allocator); + + GenericTensorAccessorR correct = create_2d_accessor_r_with_contents( + { + {3, 0}, + {3, -1}, + {6, 0.75}, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + TEST_CASE("tensor_accessor_reduce") { + Allocator cpu_allocator = create_local_cpu_memory_allocator(); + + GenericTensorAccessorR input = create_2d_accessor_r_with_contents( + { + {3, 3, 6}, + {0, -1, 0.75}, + }, + cpu_allocator); + + SUBCASE("inner dim") { + GenericTensorAccessorW result = + tensor_accessor_reduce(input, ff_dim_t{1_n}, cpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + { + 3 + 3 + 6, + 0 + (-1) + 0.75, + }, + cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + + SUBCASE("outer dim") { + GenericTensorAccessorW result = + tensor_accessor_reduce(input, ff_dim_t{0_n}, cpu_allocator); + + GenericTensorAccessorR correct = + create_1d_accessor_r_with_contents( + {(3 + 0), (3 + (-1)), (6 + 0.75)}, cpu_allocator); + + CHECK_MESSAGE(accessors_are_equal(result, correct), + check_kv("result", format_accessor_w_contents(result))); + } + } +} diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index a086974a74..a313de72d5 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/attention_kernels.h" +#include "internal/test_utils.h" +#include 
"kernels/attention_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -25,7 +25,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( + MHAPerDeviceState state = Kernels::MultiHeadAttention::gpu_init_kernel( managed_handle.raw_handle(), allocator, /*num_samples=*/num_samples.int_from_positive_int(), @@ -71,11 +71,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW weight_accessor = create_random_filled_accessor_w(weight_shape, allocator); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::MultiHeadAttention::forward_kernel( + Kernels::MultiHeadAttention::gpu_forward_kernel( managed_stream.raw_stream(), state, query_accessor.get_float_ptr(), @@ -87,7 +87,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW query_grad_accessor = create_random_filled_accessor_w(query_shape, allocator); GenericTensorAccessorW key_grad_accessor = @@ -99,7 +99,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); - Kernels::MultiHeadAttention::backward_kernel( + Kernels::MultiHeadAttention::gpu_backward_kernel( managed_stream.raw_stream(), state, query_accessor.get_float_ptr(), @@ -113,6 +113,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { output_grad_accessor.get_float_ptr()); } - Kernels::MultiHeadAttention::cleanup_kernel(allocator, state); + Kernels::MultiHeadAttention::gpu_cleanup_kernel(allocator, state); } } diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index b0fe356c95..8a904b7a0d 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/batch_matmul_kernels.h" +#include "internal/test_utils.h" +#include "kernels/batch_matmul_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -41,22 +41,22 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { - Kernels::BatchMatmul::forward_kernel(managed_stream.raw_stream(), - managed_handle.raw_handle(), - output_accessor.get_float_ptr(), - a_accessor.get_float_ptr(), - b_accessor.get_float_ptr(), - m.int_from_positive_int(), - n.int_from_positive_int(), - k.int_from_positive_int(), - batch.int_from_positive_int(), - a_seq_length_dim, - b_seq_length_dim, - seq_length); + SUBCASE("gpu_forward_kernel") { + Kernels::BatchMatmul::gpu_forward_kernel(managed_stream.raw_stream(), + managed_handle.raw_handle(), + output_accessor.get_float_ptr(), + a_accessor.get_float_ptr(), + b_accessor.get_float_ptr(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int(), + a_seq_length_dim, + b_seq_length_dim, + seq_length); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW o_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW a_grad_accessor = @@ -64,18 +64,18 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW b_grad_accessor = allocator.allocate_tensor(input_shape_b); - 
Kernels::BatchMatmul::backward_kernel(managed_stream.raw_stream(), - managed_handle.raw_handle(), - output_accessor.get_float_ptr(), - o_grad_accessor.get_float_ptr(), - a_accessor.get_float_ptr(), - a_grad_accessor.get_float_ptr(), - b_accessor.get_float_ptr(), - b_grad_accessor.get_float_ptr(), - m.int_from_positive_int(), - n.int_from_positive_int(), - k.int_from_positive_int(), - batch.int_from_positive_int()); + Kernels::BatchMatmul::gpu_backward_kernel(managed_stream.raw_stream(), + managed_handle.raw_handle(), + output_accessor.get_float_ptr(), + o_grad_accessor.get_float_ptr(), + a_accessor.get_float_ptr(), + a_grad_accessor.get_float_ptr(), + b_accessor.get_float_ptr(), + b_grad_accessor.get_float_ptr(), + m.int_from_positive_int(), + n.int_from_positive_int(), + k.int_from_positive_int(), + batch.int_from_positive_int()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index c173fd6d24..1be45d8bbb 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/batch_norm_kernels.h" +#include "internal/test_utils.h" +#include "kernels/batch_norm_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -19,7 +19,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - BatchNormPerDeviceState state = Kernels::BatchNorm::init_kernel( + BatchNormPerDeviceState state = Kernels::BatchNorm::gpu_init_kernel( /*handle=*/managed_handle.raw_handle(), /*allocator=*/allocator, /*runningMean=*/nullptr, @@ -53,11 +53,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW scale_accessor = create_filled_accessor_w( scale_shape, allocator, make_float_data_type_value(1)); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW bias_accessor = create_filled_accessor_w( bias_shape, allocator, make_float_data_type_value(0)); - Kernels::BatchNorm::forward_kernel( + Kernels::BatchNorm::gpu_forward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, /*input_ptr=*/input_accessor.get_float_ptr(), @@ -68,7 +68,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = @@ -78,7 +78,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW bias_grad_accessor = create_random_filled_accessor_w(bias_shape, allocator); - Kernels::BatchNorm::backward_kernel( + Kernels::BatchNorm::gpu_backward_kernel( /*stream=*/managed_stream.raw_stream(), /*per_device_state=*/state, /*output_ptr=*/output_accessor.get_float_ptr(), @@ -89,19 +89,13 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), /*numElements=*/ - input_accessor.shape.num_elements().int_from_positive_int()); + get_num_elements(input_accessor.shape.dims).int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); CHECK(contains_non_zero(scale_grad_accessor)); CHECK(contains_non_zero(bias_grad_accessor)); } - Kernels::BatchNorm::cleanup_kernel(allocator, - state.inputTensor, - state.biasTensor, - state.outputTensor, - state.actiDesc, - true, - state.runningMean); + Kernels::BatchNorm::gpu_cleanup_kernel(allocator, state); } } diff 
--git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index 9472e44a15..5657db71ef 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -1,6 +1,7 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include "kernels/cast_kernels.h" #include "kernels/cast_kernels_cpu.h" +#include "kernels/cast_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -19,27 +20,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::DOUBLE, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Cast::forward_kernel( + Kernels::Cast::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR grad_output_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW grad_input_accessor = create_zero_filled_accessor_w(input_shape, allocator); - Kernels::Cast::backward_kernel(managed_stream.raw_stream(), - grad_output_accessor, - grad_input_accessor); + Kernels::Cast::gpu_backward_kernel(managed_stream.raw_stream(), + grad_output_accessor, + grad_input_accessor); CHECK(contains_non_zero(grad_input_accessor)); } @@ -68,7 +69,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Cast::forward_kernel( + Kernels::Cast::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); // Run CPU Forward Kernel diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc deleted file mode 100644 index 7ac4d0f881..0000000000 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/combine_kernels.h" -#include "kernels/combine_kernels_cpu.h" -#include - -using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Call Combine Forward and Backward Kernels") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Combine::forward_kernel( - managed_stream.raw_stream(), input_accessor, output_accessor); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Combine::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } - - TEST_CASE("Check Combine Forward Kernel against CPU Kernel") { - 
ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_p, 5_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - // Run GPU Combine Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - gpu_allocator.allocate_tensor(output_shape); - - Kernels::Combine::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - // Run CPU Combine Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - cpu_allocator.allocate_tensor(output_shape); - - Kernels::Combine::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); - } - - SUBCASE("backward_kernel") { - // Run GPU Combine Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - create_zero_filled_accessor_w(input_shape, gpu_allocator); - - Kernels::Combine::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu); - - // Run CPU Combine Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - create_zero_filled_accessor_w(input_shape, cpu_allocator); - - Kernels::Combine::cpu_backward_kernel(output_grad_accessor_cpu, - input_grad_accessor_cpu); - - CHECK(accessors_are_equal(input_grad_accessor_gpu, - input_grad_accessor_cpu)); - } - } -} diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 5dc8e441bd..e3fdd3ad61 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/concat_kernels.h" +#include "internal/test_utils.h" +#include "kernels/concat_kernels_gpu.h" #include "utils/containers/repeat.h" #include @@ -14,7 +14,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { const positive_int num_inputs = 4_p; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { auto run_forward_test = [&](positive_int input_rows, positive_int input_cols, TensorShape output_shape, @@ -32,10 +32,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Concat::forward_kernel(managed_stream.raw_stream(), - output_accessor, - input_accessors, - concat_axis); + Kernels::Concat::gpu_forward_kernel(managed_stream.raw_stream(), + output_accessor, + input_accessors, + concat_axis); CHECK(contains_non_zero(output_accessor)); }; @@ -61,7 +61,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { auto run_backward_test = [&](positive_int input_rows, positive_int input_cols, TensorShape output_shape, @@ -79,10 +79,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { return create_zero_filled_accessor_w(input_shape, allocator); }); - Kernels::Concat::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessors, - 
concat_axis); + Kernels::Concat::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessors, + concat_axis); for (auto &accessor : input_grad_accessors) { CHECK(contains_non_zero(accessor)); diff --git a/lib/kernels/test/src/test_cuda.cc b/lib/kernels/test/src/test_cuda.cc index 60bc6251b2..de3215cf2d 100644 --- a/lib/kernels/test/src/test_cuda.cc +++ b/lib/kernels/test/src/test_cuda.cc @@ -1,4 +1,4 @@ -#include "kernels/test_utils.h" +#include "internal/test_utils.h" #include #include diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index fb8b8dc87c..f6048e6771 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/dropout_kernels.h" +#include "internal/test_utils.h" +#include "kernels/dropout_kernels_gpu.h" #include "utils/containers/count.h" #include @@ -9,10 +9,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { unsigned long long seed = 12345; float dropout_rate = 0.1; - ArrayShape shape = ArrayShape{ - std::vector{10_p, 10_p}, - }; - TensorShape input_shape = TensorShape{ TensorDims{FFOrdered{10_p, 10_p}}, DataType::FLOAT, @@ -26,8 +22,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - DropoutPerDeviceState state = Kernels::Dropout::init_kernel( - managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); + DropoutPerDeviceState state = + Kernels::Dropout::gpu_init_kernel(managed_handle.raw_handle(), + dropout_rate, + seed, + output_shape, + allocator); SUBCASE("forward_kernel") { GenericTensorAccessorR input_accessor = @@ -35,10 +35,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Dropout::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr()); + Kernels::Dropout::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } @@ -49,16 +49,12 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_data = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Dropout::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_data.get_float_ptr(), - input_grad_data.get_float_ptr()); + Kernels::Dropout::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_grad_data.get_float_ptr(), + input_grad_data.get_float_ptr()); } - Kernels::Dropout::cleanup_kernel(allocator, - state.inputTensor, - state.outputTensor, - state.dropoutDesc, - state.dropoutStates); + Kernels::Dropout::gpu_cleanup_kernel(allocator, state); } } diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index cea07ce781..71831ced48 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/flat_kernels.h" +#include "internal/test_utils.h" +#include "kernels/flat_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -23,27 +23,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { read_only_accessor_from_write_accessor(create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(2))); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - 
Kernels::Flat::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor.get_float_ptr()); + Kernels::Flat::gpu_forward_kernel(managed_stream.raw_stream(), + input_accessor, + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( output_shape, allocator, make_float_data_type_value(0)); GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(1)); - Kernels::Flat::backward_kernel(managed_stream.raw_stream(), - input_accessor, - output_grad_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr()); + Kernels::Flat::gpu_backward_kernel(managed_stream.raw_stream(), + input_accessor, + output_grad_accessor.get_float_ptr(), + input_grad_accessor.get_float_ptr()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 6a553bd107..d08058b063 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/gather_kernels.h" +#include "internal/test_utils.h" +#include "kernels/gather_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -13,13 +13,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{0_n}}; - - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { auto run_forward_test = [&](TensorShape input_shape, TensorShape index_shape, TensorShape output_shape) { + ff_dim_t dim = ff_dim_t{ + nonnegative_int{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}, + }; + GatherPerDeviceState state = + Kernels::Gather::gpu_init_kernel(managed_handle.raw_handle(), dim); + GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorR index_accessor = @@ -27,11 +31,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Gather::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - index_accessor, - output_accessor); + Kernels::Gather::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + index_accessor, + output_accessor); CHECK(contains_non_zero(output_accessor)); }; @@ -69,10 +73,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { } } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { auto run_backward_test = [&](TensorShape input_shape, TensorShape index_shape, TensorShape output_shape) { + ff_dim_t dim = ff_dim_t{ + nonnegative_int{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}, + }; + GatherPerDeviceState state = + Kernels::Gather::gpu_init_kernel(managed_handle.raw_handle(), dim); + GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorR index_accessor = @@ -80,11 +91,11 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Gather::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - index_accessor, - input_grad_accessor); + Kernels::Gather::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_grad_accessor, + index_accessor, + 
input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); }; diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 5382bb3a84..e6796b5768 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/layer_norm_kernels.h" +#include "internal/test_utils.h" +#include "kernels/layer_norm_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -29,34 +29,34 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - LayerNormPerDeviceState state = - Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), - allocator, - elementwise_affine, - batch_size.int_from_positive_int(), - feature_size.int_from_positive_int(), - epsilon); + LayerNormPerDeviceState state = Kernels::LayerNorm::gpu_init_kernel( + managed_handle.raw_handle(), + allocator, + elementwise_affine, + batch_size.int_from_positive_int(), + feature_size.int_from_positive_int(), + epsilon); GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW gamma_accessor = create_filled_accessor_w( feature_shape, allocator, make_float_data_type_value(1)); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); GenericTensorAccessorW beta_accessor = create_filled_accessor_w( feature_shape, allocator, make_float_data_type_value(0)); - Kernels::LayerNorm::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor, - output_accessor, - gamma_accessor, - beta_accessor); + Kernels::LayerNorm::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor, + output_accessor, + gamma_accessor, + beta_accessor); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = @@ -66,7 +66,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW beta_grad_accessor = allocator.allocate_tensor(feature_shape); - Kernels::LayerNorm::backward_kernel( + Kernels::LayerNorm::gpu_backward_kernel( managed_stream.raw_stream(), state, output_grad_accessor, diff --git a/lib/kernels/test/src/test_managed_ff_stream.cc b/lib/kernels/test/src/test_managed_ff_stream.cc deleted file mode 100644 index 25a346446b..0000000000 --- a/lib/kernels/test/src/test_managed_ff_stream.cc +++ /dev/null @@ -1,107 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/gather_kernels.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test ManagedFFStream") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - Allocator allocator = create_local_cuda_memory_allocator(); - - GatherPerDeviceState state = {managed_handle.raw_handle(), - legion_dim_t{0_n}}; - - SUBCASE("forward_kernel") { - auto run_forward_test = [&](TensorShape const &input_shape, - TensorShape const &index_shape, - TensorShape const &output_shape) { - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(index_shape, allocator); - GenericTensorAccessorW 
output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Gather::forward_kernel(/*stream=*/managed_stream.raw_stream(), - /*per_device_state=*/state, - /*input=*/input_accessor, - /*index=*/index_accessor, - /*output=*/output_accessor); - - CHECK(contains_non_zero(output_accessor)); - }; - - SUBCASE("test gather forward, 2D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 20_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 20_p}}, - DataType::FLOAT, - }; - run_forward_test(input_shape, index_shape, output_shape); - } - - SUBCASE("test gather forward, 1D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::FLOAT, - }; - run_forward_test(input_shape, index_shape, output_shape); - } - } - - SUBCASE("backward_kernel") { - auto run_backward_test = [&](TensorShape const &input_shape, - TensorShape const &index_shape, - TensorShape const &output_shape) { - GenericTensorAccessorR output_grad_accessor = - create_random_filled_accessor_r(output_shape, allocator); - GenericTensorAccessorR index_accessor = - create_random_filled_accessor_r(index_shape, allocator); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Gather::backward_kernel(/*stream=*/managed_stream.raw_stream(), - /*per_device_state=*/state, - /*output_grad=*/output_grad_accessor, - /*index=*/index_accessor, - /*input_grad=*/input_grad_accessor); - CHECK(contains_non_zero(input_grad_accessor)); - }; - - SUBCASE("test gather backward, 2D") { - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 100_p}}, - DataType::FLOAT, - }; - TensorShape index_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 25_p}}, - DataType::INT32, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{2_p, 25_p}}, - DataType::FLOAT, - }; - run_backward_test(input_shape, index_shape, output_shape); - } - } - } -} diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc deleted file mode 100644 index c042ae3175..0000000000 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/partition_kernels.h" -#include "op-attrs/datatype_value.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test Partition Forward and Backward") { - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - RepartitionPerDeviceState state = Kernels::Repartition::init_kernel( - managed_handle.raw_handle(), DataType::FLOAT); - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_p, 10_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = input_shape; - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input_accessor = create_filled_accessor_r( - input_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - 
Kernels::Repartition::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( - input_shape, allocator, make_float_data_type_value(2)); - - Kernels::Repartition::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } -} diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index 58fff5c884..5aa32899cb 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/pool_2d_kernels.h" +#include "internal/test_utils.h" +#include "kernels/pool_2d_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include @@ -30,7 +30,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - Pool2DPerDeviceState state = Kernels::Pool2D::init_kernel( + Pool2DPerDeviceState state = Kernels::Pool2D::gpu_init_kernel( /*handle=*/managed_handle.raw_handle(), /*activation=*/std::nullopt, /*input_w=*/input_w.int_from_positive_int(), @@ -63,27 +63,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { - Kernels::Pool2D::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.ptr, - output_accessor.ptr); + SUBCASE("gpu_forward_kernel") { + Kernels::Pool2D::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.ptr, + output_accessor.ptr); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_filled_accessor_w( output_shape, allocator, make_float_data_type_value(1)); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Pool2D::backward_kernel(managed_stream.raw_stream(), - state, - output_accessor.ptr, - output_grad_accessor.ptr, - input_accessor.ptr, - input_grad_accessor.ptr); + Kernels::Pool2D::gpu_backward_kernel(managed_stream.raw_stream(), + state, + output_accessor.ptr, + output_grad_accessor.ptr, + input_accessor.ptr, + input_grad_accessor.ptr); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc deleted file mode 100644 index 4d030c4d93..0000000000 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ /dev/null @@ -1,57 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/reduction_kernels.h" -#include "op-attrs/datatype_value.h" -#include - -using namespace ::FlexFlow; -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Test Reduction Forward and Backward Kernel") { - std::size_t num_replicas = 5; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{10_p, 10_p, 10_p, 10_p, 10_p}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator allocator = create_local_cuda_memory_allocator(); - - 
SUBCASE("forward_kernel") { - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{10_p}}, - DataType::FLOAT, - }; - - GenericTensorAccessorR input_accessor = - create_random_filled_accessor_r(input_shape, allocator); - GenericTensorAccessorW output_accessor = - allocator.allocate_tensor(output_shape); - - Kernels::Reduction::forward_kernel(managed_stream.raw_stream(), - input_accessor, - output_accessor, - num_replicas); - - CHECK(contains_non_zero(output_accessor)); - } - - SUBCASE("backward_kernel") { - TensorShape output_shape = input_shape; - - GenericTensorAccessorR output_grad_accessor = create_filled_accessor_r( - output_shape, allocator, make_float_data_type_value(1)); - GenericTensorAccessorW input_grad_accessor = - allocator.allocate_tensor(input_shape); - - Kernels::Reduction::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor); - - CHECK(contains_non_zero(input_grad_accessor)); - } - } -} diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc deleted file mode 100644 index 9806cefe8d..0000000000 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ /dev/null @@ -1,153 +0,0 @@ -#include "kernels/test_utils.h" -#include "kernels/create_accessor_with_contents.h" -#include "kernels/format_accessor_contents.h" -#include "kernels/replicate_kernels.h" -#include "kernels/replicate_kernels_cpu.h" -#include "test/utils/doctest/check_kv.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("Call Replicate Forward and Backward Kernels") { - nonnegative_int num_replicas = 10_n; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{3_p}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - GenericTensorAccessorR input = - create_1d_accessor_r_with_contents({1, 3, 2}, gpu_allocator); - - GenericTensorAccessorW output = - gpu_allocator.allocate_tensor(output_shape); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input, output); - - GenericTensorAccessorR correct = input; - - CHECK_MESSAGE(accessors_are_equal(output, correct), - check_kv("output", format_accessor_w_contents(output))); - } - - SUBCASE("backward_kernel") { - GenericTensorAccessorR output_grad = - create_2d_accessor_r_with_contents( - { - {1, 2, 3}, - {4, 3, 3}, - {1, 3, 5}, - }, - gpu_allocator); - - GenericTensorAccessorR correct = - create_1d_accessor_r_with_contents( - {1 + 2 + 3, 4 + 3 + 3, 1 + 3 + 5}, cpu_allocator); - - GenericTensorAccessorW input_grad = - gpu_allocator.allocate_tensor(input_shape); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - output_grad, - input_grad, - num_replicas.unwrap_nonnegative()); - - CHECK_MESSAGE( - accessors_are_equal(input_grad, correct), - check_kv("input_grad", format_accessor_w_contents(input_grad))); - } - } - - TEST_CASE("Check Replicate Forward and Backward Kernel against CPU Kernel") { - positive_int num_replicas = 2_p; - - TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{5_p}}, - DataType::FLOAT, - }; - TensorShape output_shape = TensorShape{ - 
TensorDims{FFOrdered{5_p, num_replicas}}, - DataType::FLOAT, - }; - - ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( - /*workSpaceSize=*/1024 * 1024, - /*allowTensorOpMathConversion=*/true); - ManagedFFStream managed_stream{}; - - Allocator gpu_allocator = create_local_cuda_memory_allocator(); - Allocator cpu_allocator = create_local_cpu_memory_allocator(); - - SUBCASE("forward_kernel") { - // Run GPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_gpu = - create_random_filled_accessor_r(input_shape, gpu_allocator); - GenericTensorAccessorW output_accessor_gpu = - create_zero_filled_accessor_w(output_shape, gpu_allocator); - - Kernels::Replicate::forward_kernel( - managed_stream.raw_stream(), input_accessor_gpu, output_accessor_gpu); - - // Run CPU Replicate Forward Kernel - GenericTensorAccessorR input_accessor_cpu = - copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); - GenericTensorAccessorW output_accessor_cpu = - create_zero_filled_accessor_w(output_shape, cpu_allocator); - - Kernels::Replicate::cpu_forward_kernel(input_accessor_cpu, - output_accessor_cpu); - - CHECK_MESSAGE( - accessors_are_equal(output_accessor_gpu, output_accessor_cpu), - check_kv("input", format_accessor_r_contents(input_accessor_cpu)), - check_kv("gpu", format_accessor_w_contents(output_accessor_gpu)), - check_kv("cpu", format_accessor_w_contents(output_accessor_cpu))); - } - - SUBCASE("backward_kernel") { - // Run GPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_gpu = - create_random_filled_accessor_r(output_shape, gpu_allocator); - GenericTensorAccessorW input_grad_accessor_gpu = - create_zero_filled_accessor_w(input_shape, gpu_allocator); - - Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu, - num_replicas.int_from_positive_int()); - - // Run CPU Replicate Backward Kernel - GenericTensorAccessorR output_grad_accessor_cpu = - copy_tensor_accessor_r(output_grad_accessor_gpu, cpu_allocator); - GenericTensorAccessorW input_grad_accessor_cpu = - create_zero_filled_accessor_w(input_shape, cpu_allocator); - - Kernels::Replicate::cpu_backward_kernel( - output_grad_accessor_cpu, - input_grad_accessor_cpu, - num_replicas.int_from_positive_int()); - - CHECK_MESSAGE( - accessors_are_equal(input_grad_accessor_gpu, input_grad_accessor_cpu), - check_kv("output_grad", - format_accessor_r_contents(output_grad_accessor_cpu)), - check_kv("gpu", format_accessor_w_contents(input_grad_accessor_gpu)), - check_kv("cpu", format_accessor_w_contents(input_grad_accessor_cpu))); - } - } -} diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index 011f35e567..ad598c9055 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/reshape_kernels.h" +#include "internal/test_utils.h" +#include "kernels/reshape_kernels_gpu.h" #include using namespace ::FlexFlow; @@ -16,33 +16,32 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorDims{FFOrdered{100_p}}, DataType::FLOAT, }; - TensorShape output_shape = input_shape; - - ReshapePerDeviceState state = - Kernels::Reshape::init_kernel(DataType::FLOAT); + TensorShape output_shape = TensorShape{ + TensorDims{FFOrdered{100_p}}, + DataType::INT32, + }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, 
allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reshape::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); + Kernels::Reshape::gpu_forward_kernel( + managed_stream.raw_stream(), input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Reshape::backward_kernel(managed_stream.raw_stream(), - state, - output_grad_accessor, - input_grad_accessor); + Kernels::Reshape::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index fc5c8deaad..731b530910 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -1,6 +1,6 @@ -#include "kernels/test_utils.h" -#include "kernels/reverse_kernels.h" +#include "internal/test_utils.h" #include "kernels/reverse_kernels_cpu.h" +#include "kernels/reverse_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include <doctest/doctest.h> @@ -24,29 +24,29 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor(create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(1))); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reverse::forward_kernel( + Kernels::Reverse::gpu_forward_kernel( managed_stream.raw_stream(), input_accessor, output_accessor, attrs); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor, - input_grad_accessor, - attrs); + Kernels::Reverse::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor, + input_grad_accessor, + attrs); CHECK(contains_non_zero(input_grad_accessor)); } @@ -71,17 +71,17 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*axis=*/ff_dim_t{0_n}, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { // Run GPU Reverse Forward Kernel GenericTensorAccessorR input_accessor_gpu = create_random_filled_accessor_r(input_shape, gpu_allocator); GenericTensorAccessorW output_accessor_gpu = create_zero_filled_accessor_w(output_shape, gpu_allocator); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor_gpu, - output_accessor_gpu, - attrs); + Kernels::Reverse::gpu_forward_kernel(managed_stream.raw_stream(), + input_accessor_gpu, + output_accessor_gpu, + attrs); // Run CPU Reverse Forward Kernel GenericTensorAccessorR input_accessor_cpu = copy_tensor_accessor_r(input_accessor_gpu, cpu_allocator); GenericTensorAccessorW output_accessor_cpu = create_zero_filled_accessor_w(output_shape, cpu_allocator); Kernels::Reverse::cpu_forward_kernel(input_accessor_cpu, output_accessor_cpu); CHECK(accessors_are_equal(output_accessor_gpu, output_accessor_cpu)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { // Run GPU Reverse Backward Kernel GenericTensorAccessorR output_grad_accessor_gpu = create_random_filled_accessor_r(output_shape, gpu_allocator); 
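// A minimal standalone sketch of the reverse-along-axis-0 semantics that the
// GPU/CPU parity subcases in this file check; `reverse_axis0` is a
// hypothetical helper for illustration, not the library's cpu kernel.
#include <cstddef>
#include <vector>

std::vector<float> reverse_axis0(std::vector<float> const &in,
                                 std::size_t num_rows,
                                 std::size_t num_cols) {
  // Output row i is input row (num_rows - 1 - i); order within each row is
  // unchanged, matching ReverseAttrs{/*axis=*/ff_dim_t{0_n}} above.
  std::vector<float> out(in.size());
  for (std::size_t i = 0; i < num_rows; i++) {
    for (std::size_t j = 0; j < num_cols; j++) {
      out[i * num_cols + j] = in[(num_rows - 1 - i) * num_cols + j];
    }
  }
  return out;
}

// The backward pass applies the same permutation to the gradient, so the
// backward subcase below can compare GPU and CPU outputs element-for-element.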
@@ -103,10 +103,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { GenericTensorAccessorW input_grad_accessor_gpu = create_zero_filled_accessor_w(input_shape, gpu_allocator); - Kernels::Reverse::backward_kernel(managed_stream.raw_stream(), - output_grad_accessor_gpu, - input_grad_accessor_gpu, - attrs); + Kernels::Reverse::gpu_backward_kernel(managed_stream.raw_stream(), + output_grad_accessor_gpu, + input_grad_accessor_gpu, + attrs); // Run CPU Reverse Backward Kernel GenericTensorAccessorR output_grad_accessor_cpu = diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index bb449f6755..ca94bf58d1 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/softmax_kernels.h" +#include "internal/test_utils.h" +#include "kernels/softmax_kernels_gpu.h" #include <doctest/doctest.h> using namespace ::FlexFlow; @@ -26,39 +26,40 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { TensorShape output_shape = input_shape; SoftmaxPerDeviceState state = - Kernels::Softmax::init_kernel(managed_handle.raw_handle(), - 0, - input_n.unwrap_nonnegative(), - channels.unwrap_nonnegative(), - input_h.unwrap_nonnegative(), - input_w.unwrap_nonnegative()); + Kernels::Softmax::gpu_init_kernel(managed_handle.raw_handle(), + ff_dim_t{3_n}, + input_n.unwrap_nonnegative(), + channels.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Softmax::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr()); + Kernels::Softmax::gpu_forward_kernel(managed_stream.raw_stream(), + state, + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr()); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = allocator.allocate_tensor(input_shape); - Kernels::Softmax::backward_kernel( + Kernels::Softmax::gpu_backward_kernel( managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements().int_from_positive_int()); + get_num_elements(output_grad_accessor.shape.dims) + .int_from_positive_int()); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index 2597db95e0..35866308ee 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/split_kernels.h" +#include "internal/test_utils.h" +#include "kernels/split_kernels_gpu.h" #include "op-attrs/datatype_value.h" #include "utils/containers/repeat.h" #include <doctest/doctest.h> @@ -9,9 +9,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { nonnegative_int num_outputs = 2_n; - coord_t out_blk_sizes[] = {50, 50}; - coord_t in_blk_size = 100; - coord_t num_blks = 1; + int out_blk_sizes[] = {50, 50}; + int in_blk_size = 100; + int num_blks = 1; ManagedPerDeviceFFHandle 
managed_handle = initialize_single_gpu_handle( /*workSpaceSize=*/1024 * 1024, @@ -29,7 +29,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { DataType::FLOAT, }; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); @@ -39,16 +39,16 @@ return output_accessor.get_float_ptr(); }); - Kernels::Split::forward_kernel(managed_stream.raw_stream(), - output_ptrs.data(), - input_accessor.get_float_ptr(), - out_blk_sizes, - in_blk_size, - num_blks, - num_outputs.unwrap_nonnegative()); + Kernels::Split::gpu_forward_kernel(managed_stream.raw_stream(), + output_ptrs.data(), + input_accessor.get_float_ptr(), + out_blk_sizes, + in_blk_size, + num_blks, + num_outputs.unwrap_nonnegative()); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { std::vector<float *> output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = @@ -59,13 +59,14 @@ GenericTensorAccessorW input_grad_accessor = create_filled_accessor_w( input_shape, allocator, make_float_data_type_value(0)); - Kernels::Split::backward_kernel(managed_stream.raw_stream(), - input_grad_accessor.get_float_ptr(), - (float const **)output_grad_ptrs.data(), - out_blk_sizes, - in_blk_size, - num_blks, - num_outputs.unwrap_nonnegative()); + Kernels::Split::gpu_backward_kernel( + managed_stream.raw_stream(), + input_grad_accessor.get_float_ptr(), + (float const **)output_grad_ptrs.data(), + out_blk_sizes, + in_blk_size, + num_blks, + num_outputs.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index c0b2d4db5e..9d4809b2cf 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -1,5 +1,5 @@ -#include "kernels/test_utils.h" -#include "kernels/transpose_kernels.h" +#include "internal/test_utils.h" +#include "kernels/transpose_kernels_gpu.h" #include <doctest/doctest.h> @@ -25,28 +25,28 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { }; TensorShape output_shape = input_shape; - SUBCASE("forward_kernel") { + SUBCASE("gpu_forward_kernel") { GenericTensorAccessorR input_accessor = create_random_filled_accessor_r(input_shape, allocator); GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Transpose::forward_kernel( + Kernels::Transpose::gpu_forward_kernel( managed_stream.raw_stream(), attrs, input_accessor, output_accessor); CHECK(contains_non_zero(output_accessor)); } - SUBCASE("backward_kernel") { + SUBCASE("gpu_backward_kernel") { GenericTensorAccessorR output_grad_accessor = create_random_filled_accessor_r(output_shape, allocator); GenericTensorAccessorW input_grad_accessor = create_random_filled_accessor_w(input_shape, allocator); - Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), - attrs, - output_grad_accessor, - input_grad_accessor); + Kernels::Transpose::gpu_backward_kernel(managed_stream.raw_stream(), + attrs, + output_grad_accessor, + input_grad_accessor); CHECK(contains_non_zero(input_grad_accessor)); } diff --git a/lib/local-execution/CMakeLists.txt b/lib/local-execution/CMakeLists.txt index db0cf7603f..b75f81fb3e 100644 --- a/lib/local-execution/CMakeLists.txt +++ b/lib/local-execution/CMakeLists.txt @@ -14,6 +14,7 @@ ff_add_library( task-spec pcg spdlog + compiler ) add_subdirectory(test) diff --git 
a/lib/local-execution/include/local-execution/allocated_tensors.h b/lib/local-execution/include/local-execution/allocated_tensors.h deleted file mode 100644 index f3face6ace..0000000000 --- a/lib/local-execution/include/local-execution/allocated_tensors.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_ALLOCATED_TENSORS_H - -#include "local-execution/allocated_tensors.dtg.h" -#include "pcg/computation_graph.h" - -namespace FlexFlow { - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool are_allocated_tensors_valid( - AllocatedTensors const &, - std::unordered_map const &); - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &, - std::unordered_map const &, - ArrayShape const &); - -AllocatedTensors make_empty_allocated_tensors(); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml b/lib/local-execution/include/local-execution/allocated_tensors.struct.toml deleted file mode 100644 index 33985b0d74..0000000000 --- a/lib/local-execution/include/local-execution/allocated_tensors.struct.toml +++ /dev/null @@ -1,30 +0,0 @@ -namespace = "FlexFlow" -name = "AllocatedTensors" -features = [ - "eq", - "fmt", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/include/local-execution/cost_details.struct.toml b/lib/local-execution/include/local-execution/cost_details.struct.toml deleted file mode 100644 index d17438b9ff..0000000000 --- a/lib/local-execution/include/local-execution/cost_details.struct.toml +++ /dev/null @@ -1,18 +0,0 @@ -namespace = "FlexFlow" -name = "CostDetails" -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "total_elapsed_time" -type = "float" - -[[fields]] -name = "total_mem_usage" -type = "size_t" diff --git a/lib/local-execution/include/local-execution/cost_estimate.h b/lib/local-execution/include/local-execution/cost_estimate.h deleted file mode 100644 index 7020089ccf..0000000000 --- a/lib/local-execution/include/local-execution/cost_estimate.h +++ /dev/null @@ -1,63 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_ESTIMATE_H -#define _FLEXFLOW_LOCAL_EXECUTION_COST_ESTIMATE_H - -#include "local-execution/cost_details.dtg.h" -#include "local-execution/local_training_backing.h" -#include "op-attrs/parallel_tensor_shape.dtg.h" -#include "op-attrs/pcg_operator_attrs.dtg.h" -#include "pcg/machine_view.h" -#include "pcg/parallel_computation_graph/parallel_tensor_attrs.dtg.h" -namespace FlexFlow { - -struct ICostEstimator { - virtual CostDetails - estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, 
- std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const = 0; - virtual float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const = 0; - - ICostEstimator() = default; - ICostEstimator(ICostEstimator const &) = delete; - ICostEstimator &operator=(ICostEstimator const &) = delete; - - virtual ~ICostEstimator() = default; -}; -CHECK_RC_COPY_VIRTUAL_COMPLIANT(ICostEstimator); - -struct CostEstimator { - CostDetails estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const { - return this->implementation_ptr->estimate_cost( - op, inputs, weights, outputs, mv); - } - - float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const { - return this->implementation_ptr->estimate_cost(tensor_shape, src, dst); - } - - template - static typename std::enable_if::value, - CostEstimator>::type - create(Args &&...args) { - return CostEstimator(std::make_shared(std::forward(args)...)); - } - -private: - CostEstimator(std::shared_ptr implementation_ptr) - : implementation_ptr(implementation_ptr) {} - std::shared_ptr implementation_ptr; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/cost_metrics.h b/lib/local-execution/include/local-execution/cost_metrics.h deleted file mode 100644 index edc0190daf..0000000000 --- a/lib/local-execution/include/local-execution/cost_metrics.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H -#define _FLEXFLOW_LOCAL_EXECUTION_COST_METRICS_H - -#include "utils/visitable.h" - -namespace FlexFlow { - -/** - * @brief Costs of an operator. - */ -struct CostMetrics : public use_visitable_cmp { - CostMetrics() = delete; - CostMetrics(float forward_time, - float backward_time, - float sync_type, - size_t inputs_memory, - size_t outputs_memory, - size_t weights_memory); - /** - * @brief Return the sum of inputs_memory, outputs_memory, and weights_memory - * recorded in this CostMetrics. - */ - size_t total_memory() const; - - /** - * @brief Return the sum of memory recorded in this CostMetrics, but in MB, - * instead of Bytes. - */ - float total_memory_in_mb() const; - - /** - * @brief Get the incremental difference between the total memory in - * CostMetrics and sim->offset. - * @details This is to easily compute the difference between sim->offset and - * sum of all memory usage recorded in this CostMetrics. - * - * @param sim_offset Simulator->offset - * @return size_t The incremental memory usage difference - */ - size_t total_mem_diff_from(off_t sim_offset) const; - -public: - float forward_time; - float backward_time; - float sync_time; - ///< Bytes of memory usage of different parts - // Assume: - // 1. all memory allocations use Simulator::allocate - // 2. 
we call Simulator::free_all before measuring an operator
-  // Therefore, the current memory usage of an operator is (size_t)sim->offset
-  size_t inputs_memory;
-  size_t outputs_memory;
-  size_t weights_memory;
-  ///< Memory usage of Op* considering parallelization over devices
-  size_t op_total_mem;
-};
-
-} // namespace FlexFlow
-
-VISITABLE_STRUCT(::FlexFlow::CostMetrics,
-                 forward_time,
-                 backward_time,
-                 sync_time,
-                 inputs_memory,
-                 outputs_memory,
-                 weights_memory,
-                 op_total_mem);
-MAKE_VISIT_HASHABLE(::FlexFlow::CostMetrics);
-
-#endif
diff --git a/lib/local-execution/include/local-execution/gradient_tensor_source.h b/lib/local-execution/include/local-execution/gradient_tensor_source.h
deleted file mode 100644
index d724859712..0000000000
--- a/lib/local-execution/include/local-execution/gradient_tensor_source.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H
-#define _FLEXFLOW_LOCAL_EXECUTION_GRADIENT_TENSOR_SOURCE_H
-
-#include "task-spec/gradient_tensor_t.dtg.h"
-
-namespace FlexFlow {
-
-struct GradientTensorSource {
-public:
-  GradientTensorSource();
-
-  gradient_tensor_t new_gradient_tensor();
-
-  void reset();
-
-private:
-  static size_t next_available_gradient_tensor_id;
-};
-
-} // namespace FlexFlow
-
-#endif
diff --git a/lib/local-execution/include/local-execution/local_args_backing.h b/lib/local-execution/include/local-execution/local_args_backing.h
index e9044dc6fa..1e9a30d293 100644
--- a/lib/local-execution/include/local-execution/local_args_backing.h
+++ b/lib/local-execution/include/local-execution/local_args_backing.h
@@ -1,37 +1,37 @@
 #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
 #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_ARGS_BACKING_H
 
+#include "local-execution/local_args_backing.dtg.h"
 #include "local-execution/local_task_argument_accessor.h"
+#include "local-execution/local_task_registry.dtg.h"
+#include "local-execution/local_tensor_backing.dtg.h"
 #include "pcg/computation_graph.h"
-#include "pcg/layer_guid_t.dtg.h"
-#include "task-spec/op_task_invocation.h"
 #include "task-spec/per_device_op_state.h"
-#include "task-spec/runtime_arg_config.h"
+#include "task-spec/task_binding.h"
 #include "task-spec/task_invocation.dtg.h"
+#include "task-spec/training_computation_graph.dtg.h"
+#include "task-spec/training_layer_plus_context.dtg.h"
 
 namespace FlexFlow {
 
-struct LocalArgsBacking {
-  LocalArgsBacking(
-      RuntimeArgConfig const &,
-      std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates> const &);
-
-public:
-  // arguments
-  RuntimeArgConfig runtime_arg_config;
-  std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates>
-      per_device_op_states;
-};
-
-LocalArgsBacking
-    make_args_backing_with_empty_device_states(RuntimeArgConfig const &);
+LocalArgsBacking make_local_computation_args_backing_with_empty_device_states(
+    RuntimeArgConfig const &);
 
 std::optional<DeviceSpecificDeviceStates>
     get_per_device_op_state_if_exists(LocalArgsBacking const &,
                                       layer_guid_t const &);
 
-ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &,
-                                            RuntimeArgConfig const &);
+std::unordered_map<slot_id_t, ConcreteArgSpec>
+    construct_arg_slots_backing(TaskBinding const &, RuntimeArgConfig const &);
+
+TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &,
+                                           RuntimeArgConfig const &,
+                                           TaskInvocation const &,
+                                           Allocator &);
+
+LocalArgsBacking make_local_args_backing_for_computation_graph(
+    RuntimeArgConfig const &,
+    std::unordered_map<layer_guid_t,
+                       std::optional<DeviceSpecificDeviceStates>> const &);
 
 } // namespace FlexFlow
diff --git a/lib/local-execution/include/local-execution/local_args_backing.struct.toml b/lib/local-execution/include/local-execution/local_args_backing.struct.toml
new file
mode 100644 index 0000000000..449f883194 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_args_backing.struct.toml @@ -0,0 +1,18 @@ +namespace = "FlexFlow" +name = "LocalArgsBacking" +features = [] + +includes = [ + "task-spec/runtime_arg_config.dtg.h", + "task-spec/device_specific_device_states.dtg.h", + "pcg/layer_guid_t.dtg.h", + "", +] + +[[fields]] +name = "runtime_arg_config" +type = "::FlexFlow::RuntimeArgConfig" + +[[fields]] +name = "per_device_op_states" +type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::DeviceSpecificDeviceStates>>" diff --git a/lib/local-execution/include/local-execution/local_cost_estimator.h b/lib/local-execution/include/local-execution/local_cost_estimator.h index 0189475fcb..c42876bbd6 100644 --- a/lib/local-execution/include/local-execution/local_cost_estimator.h +++ b/lib/local-execution/include/local-execution/local_cost_estimator.h @@ -1,26 +1,22 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_COST_ESTIMATOR_H -#include "local-execution/cost_estimate.h" -#include "task-spec/runtime_arg_config.h" +#include "compiler/cost_estimator/cost_estimator.h" +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/runtime_arg_config.dtg.h" namespace FlexFlow { struct LocalCostEstimator : public ICostEstimator { LocalCostEstimator(RuntimeArgConfig const &); + LocalCostEstimator(LocalCostEstimator const &) = delete; LocalCostEstimator(LocalCostEstimator &&) = delete; ~LocalCostEstimator() = default; - CostDetails estimate_cost(PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const override; + OpCostMetrics estimate_cost(OpCostEstimateKey const &) const override; - float estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const override; + milliseconds_t estimate_cost(TensorSetMovement const &) const override; private: RuntimeArgConfig runtime_arg_config; diff --git a/lib/local-execution/include/local-execution/local_task_argument_accessor.h b/lib/local-execution/include/local-execution/local_task_argument_accessor.h index 184bf0b559..0ab66234eb 100644 --- a/lib/local-execution/include/local-execution/local_task_argument_accessor.h +++ b/lib/local-execution/include/local-execution/local_task_argument_accessor.h @@ -1,22 +1,21 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TASK_ARGUMENT_ACCESSOR_H -#include "task-spec/slot_tensor_type_id.dtg.h" +#include "local-execution/tensor_slot_backing.dtg.h" +#include "task-spec/runtime_arg_config.dtg.h" #include "task-spec/task_argument_accessor.h" +#include "task-spec/tensor_sub_slot_id_t.dtg.h" #include #include namespace FlexFlow { -using TensorSlotsBacking = std::unordered_map< - SlotTensorTypeId, - std::variant>>; -using ArgSlotsBacking = std::unordered_map; - struct LocalTaskArgumentAccessor : public ITaskArgumentAccessor { - LocalTaskArgumentAccessor(Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing); + explicit LocalTaskArgumentAccessor( + Allocator const &allocator, + std::unordered_map const + &tensor_slots_backing, + std::unordered_map const &arg_slots_backing); LocalTaskArgumentAccessor(LocalTaskArgumentAccessor const &) = delete; LocalTaskArgumentAccessor(LocalTaskArgumentAccessor &&) = delete; @@ -35,8 +34,9 @@ struct 
LocalTaskArgumentAccessor : public ITaskArgumentAccessor { private: Allocator allocator; - TensorSlotsBacking tensor_slots_backing; - ArgSlotsBacking arg_slots_backing; + std::unordered_map + tensor_slots_backing; + std::unordered_map arg_slots_backing; }; CHECK_RC_COPY_VIRTUAL_COMPLIANT(LocalTaskArgumentAccessor); diff --git a/lib/local-execution/include/local-execution/local_task_registry.h b/lib/local-execution/include/local-execution/local_task_registry.h new file mode 100644 index 0000000000..142433ba53 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_task_registry.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H +#define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H + +#include "local-execution/local_task_registry.dtg.h" +#include "local-execution/registered_task_t.dtg.h" +#include "pcg/layer_attrs.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "utils/units/milliseconds_t.h" + +namespace FlexFlow { + +LocalTaskRegistry construct_local_task_registry_for_layers( + std::unordered_map const &); + +std::optional try_get_registered_task( + LocalTaskRegistry const &, layer_guid_t const &, OpTaskType const &); + +std::optional call_task_impl(LocalTaskRegistry const &, + task_id_t const &task_id, + TaskArgumentAccessor const &acc); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/local_task_registry.struct.toml b/lib/local-execution/include/local-execution/local_task_registry.struct.toml new file mode 100644 index 0000000000..84abc7aa0c --- /dev/null +++ b/lib/local-execution/include/local-execution/local_task_registry.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "LocalTaskRegistry" +features = [ + "eq", + "fmt", + "hash" +] + +includes = [ + "task-spec/task_signature_impl.dtg.h", + "pcg/layer_guid_t.dtg.h", + "local-execution/operator_task_set.dtg.h" +] + +src_includes = [ + "utils/hash/unordered_map.h", + "utils/fmt/unordered_map.h", +] + +[[fields]] +name = "task_sets" +type = "std::unordered_map<::FlexFlow::layer_guid_t, ::FlexFlow::OperatorTaskSet>" + +[[fields]] +name = "task_mapping" +type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskSignatureAndImpl>" diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.h b/lib/local-execution/include/local-execution/local_tensor_backing.h index f6168f2fb1..479ad4734a 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.h +++ b/lib/local-execution/include/local-execution/local_tensor_backing.h @@ -1,46 +1,30 @@ - #ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TENSOR_BACKING_H #include "kernels/accessor.h" -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/local_task_argument_accessor.h" +#include "kernels/allocation.h" #include "local-execution/local_tensor_backing.dtg.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/task_invocation.dtg.h" -#include "task-spec/tensor_role.dtg.h" +#include "local-execution/tensor_slot_backing.dtg.h" +#include "task-spec/task_binding.h" +#include "task-spec/training_computation_graph.dtg.h" +#include 
"task-spec/training_tensor_guid_t.dtg.h" namespace FlexFlow { -GenericTensorAccessorW get_tensor(LocalTensorBacking const &, - TensorTypeVariant const &); - -std::unordered_map - get_tensor_backings( - std::unordered_map const &, - std::unordered_map const &, - Allocator &); - -std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated); +LocalTensorBacking construct_local_tensor_backing( + std::unordered_map const + &training_tensor_shapes, + std::unordered_map const + &preallocated_tensors, + Allocator &); -LocalTensorBacking construct_local_tensor_backing(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator &); +GenericTensorAccessorW + get_accessor_for_training_tensor(LocalTensorBacking const &, + training_tensor_guid_t); -TensorSlotsBacking construct_tensor_slots_backing(LocalTensorBacking const &, - TaskBinding const &); +std::unordered_map + construct_tensor_slots_backing_for_binding(LocalTensorBacking const &, + TaskBinding const &); } // namespace FlexFlow diff --git a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml index bd59ec325d..48a7a7fa90 100644 --- a/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml +++ b/lib/local-execution/include/local-execution/local_tensor_backing.struct.toml @@ -6,26 +6,14 @@ features = [ ] includes = [ - "task-spec/tensor_type_t.dtg.h", "kernels/accessor.h", - "pcg/tensor_guid_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", + "task-spec/training_tensor_guid_t.dtg.h", ] src_includes = [ "utils/fmt/unordered_map.h", - "utils/fmt/vector.h", ] [[fields]] -name = "tensor_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "tensor_gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "tensor_optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" +name = "backing_for_training_tensor_map" +type = "std::unordered_map<::FlexFlow::training_tensor_guid_t, ::FlexFlow::GenericTensorAccessorW>" diff --git a/lib/local-execution/include/local-execution/local_training_backing.h b/lib/local-execution/include/local-execution/local_training_backing.h index addac74633..5484adef75 100644 --- a/lib/local-execution/include/local-execution/local_training_backing.h +++ b/lib/local-execution/include/local-execution/local_training_backing.h @@ -1,70 +1,50 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOCAL_TRAINING_BACKING_H -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/local_args_backing.h" -#include "local-execution/local_tensor_backing.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/task_registry.h" +#include "local-execution/local_training_backing.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" +#include 
"utils/containers/generate_map.h" +#include "utils/units/milliseconds_t.h" namespace FlexFlow { -struct LocalTrainingBacking { - LocalTrainingBacking(Allocator &, - AllocatedTensors const &, - GradientTensorSource &, - ComputationGraph const &, - RuntimeArgConfig const &); +LocalTrainingBacking make_local_training_backing_for_computation_graph( + Allocator &allocator, + std::unordered_map const + &preallocated_tensors, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs); + +std::optional + create_per_device_op_state(LocalTaskRegistry const &, + LocalTensorBacking const &, + RuntimeArgConfig const &, + Allocator &, + TrainingLayerPlusContext const &); + +std::optional execute_forward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + Allocator &); + +std::optional execute_backward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + Allocator &); + +void compute_loss(LocalTrainingBacking const &, LossAttrs const &, Allocator &); - LocalTrainingBacking(Allocator &, - AllocatedTensors const &, - GradientTensorSource &, - OptimizerTensorSource &, - ComputationGraph const &, - RuntimeArgConfig const &, - OptimizerAttrs const &); - -public: - ComputationGraph computation_graph; - TaskRegistry task_registry; - LocalTensorBacking local_tensor_backing; - LocalArgsBacking local_args_backing; -}; - -LocalArgsBacking initialize_args_backing(TaskRegistry const &, - ComputationGraph const &, - RuntimeArgConfig const &, - LocalTensorBacking const &, - Allocator &); - -std::optional call_task_impl(TaskRegistry const &, - task_id_t const &task_id, - TaskArgumentAccessor const &acc); - -std::optional execute_forward(LocalTrainingBacking const &, - layer_guid_t const &, - Allocator &); -std::optional execute_backward(LocalTrainingBacking const &, - layer_guid_t const &, - Allocator &); -void compute_loss(LocalTrainingBacking const &, - LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - Allocator &); void execute_update(LocalTrainingBacking const &, layer_guid_t const &, OptimizerAttrs const &, Allocator &); -TaskArgumentAccessor get_task_arg_accessor(LocalTensorBacking const &, - LocalArgsBacking const &, - TaskInvocation const &, - Allocator &); - } // namespace FlexFlow #endif diff --git a/lib/local-execution/include/local-execution/local_training_backing.struct.toml b/lib/local-execution/include/local-execution/local_training_backing.struct.toml new file mode 100644 index 0000000000..7da8c3bed6 --- /dev/null +++ b/lib/local-execution/include/local-execution/local_training_backing.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = "LocalTrainingBacking" +features = [] + +includes = [ + "task-spec/training_computation_graph.dtg.h", + "local-execution/local_task_registry.h", + "local-execution/local_tensor_backing.h", + "local-execution/local_args_backing.h", +] + +[[fields]] +name = "training_computation_graph" +type = "::FlexFlow::TrainingComputationGraph" + +[[fields]] +name = "local_task_registry" +type = "::FlexFlow::LocalTaskRegistry" + +[[fields]] +name = "local_tensor_backing" +type = "::FlexFlow::LocalTensorBacking" + +[[fields]] +name = "local_args_backing" +type = "::FlexFlow::LocalArgsBacking" diff --git a/lib/local-execution/include/local-execution/model_training_instance.h 
b/lib/local-execution/include/local-execution/model_training_instance.h index 6f8f4b1543..bfd279fde5 100644 --- a/lib/local-execution/include/local-execution/model_training_instance.h +++ b/lib/local-execution/include/local-execution/model_training_instance.h @@ -4,31 +4,24 @@ #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct ModelTrainingInstance { ModelTrainingInstance(Allocator const &, LocalTrainingBacking const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, LossAttrs const &, OptimizerAttrs const &); Allocator allocator; LocalTrainingBacking training_backing; - tensor_guid_t logit_tensor; - loss_tensor_t label_tensor; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; public: - PerLayerElapsedTime forward(); - PerLayerElapsedTime backward(); + std::unordered_map> forward(); + std::unordered_map> backward(); void update(); GenericTensorAccessorR get_loss_tensor_accessor() const; }; diff --git a/lib/local-execution/include/local-execution/operator_task_set.h b/lib/local-execution/include/local-execution/operator_task_set.h new file mode 100644 index 0000000000..bbe9da5d7f --- /dev/null +++ b/lib/local-execution/include/local-execution/operator_task_set.h @@ -0,0 +1,24 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPERATOR_TASK_SET_H + +#include "local-execution/operator_task_set.dtg.h" +#include "op-attrs/computation_graph_op_attrs.dtg.h" +#include "task-spec/op_task_type.dtg.h" +#include "utils/bidict/bidict.h" + +namespace FlexFlow { + +bidict + get_map_from_task_type_to_task(OperatorTaskSet const &); +std::unordered_set + get_all_tasks_in_task_set(OperatorTaskSet const &); + +registered_task_t get_task_for_task_type(OperatorTaskSet const &op_task_set, + OpTaskType task_type); + +OperatorTaskSet + get_task_set_for_operator(ComputationGraphOpAttrs const &op_attrs); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/operator_task_set.struct.toml b/lib/local-execution/include/local-execution/operator_task_set.struct.toml new file mode 100644 index 0000000000..dda2a1478d --- /dev/null +++ b/lib/local-execution/include/local-execution/operator_task_set.struct.toml @@ -0,0 +1,24 @@ +namespace = "FlexFlow" +name = "OperatorTaskSet" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "local-execution/registered_task_t.dtg.h" +] + +[[fields]] +name = "init_task" +type = "::FlexFlow::registered_task_t" + +[[fields]] +name = "fwd_task" +type = "::FlexFlow::registered_task_t" + +[[fields]] +name = "bwd_task" +type = "::FlexFlow::registered_task_t" diff --git a/lib/local-execution/include/local-execution/optimizer_tensor_source.h b/lib/local-execution/include/local-execution/optimizer_tensor_source.h deleted file mode 100644 index b2b3d94ba5..0000000000 --- a/lib/local-execution/include/local-execution/optimizer_tensor_source.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_OPTIMIZER_TENSOR_SOURCE_H - -#include "task-spec/optimizer_tensor_t.dtg.h" - -namespace FlexFlow { - -struct OptimizerTensorSource { -public: - OptimizerTensorSource(); - - optimizer_tensor_t 
new_optimizer_tensor(); - - void reset(); - -private: - static size_t next_available_optimizer_tensor_id; -}; - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/registered_task.h b/lib/local-execution/include/local-execution/registered_task.h new file mode 100644 index 0000000000..d6e8a87b18 --- /dev/null +++ b/lib/local-execution/include/local-execution/registered_task.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_REGISTERED_TASK_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_REGISTERED_TASK_H + +#include "local-execution/registered_task_t.dtg.h" + +namespace FlexFlow { + +registered_task_t make_noop_registered_task(); + +} // namespace FlexFlow + +#endif diff --git a/lib/local-execution/include/local-execution/registered_task_t.variant.toml b/lib/local-execution/include/local-execution/registered_task_t.variant.toml new file mode 100644 index 0000000000..d4bab60ec9 --- /dev/null +++ b/lib/local-execution/include/local-execution/registered_task_t.variant.toml @@ -0,0 +1,27 @@ +namespace = "FlexFlow" +name = "registered_task_t" +features = [ + "eq", + "ord", + "hash", + "fmt", + "rapidcheck", +] + +includes = [ + "task-spec/task_id_t.dtg.h", + "", +] + +src_includes = [ + "utils/rapidcheck/monostate.h", + "utils/fmt/monostate.h", +] + +[[values]] +type = "::FlexFlow::task_id_t" +key = "real_task" + +[[values]] +type = "std::monostate" +key = "noop_task" diff --git a/lib/local-execution/include/local-execution/task_registry.h b/lib/local-execution/include/local-execution/task_registry.h deleted file mode 100644 index eb3e0859d0..0000000000 --- a/lib/local-execution/include/local-execution/task_registry.h +++ /dev/null @@ -1,21 +0,0 @@ - -#ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H -#define _FLEXFLOW_LOCAL_EXECUTION_TASK_REGISTRY_H - -#include "local-execution/task_registry.dtg.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "pcg/computation_graph.dtg.h" -#include "task-spec/op_task_type.dtg.h" - -namespace FlexFlow { - -TaskRegistry construct_task_registry( - std::unordered_map const &); - -bool registry_contains_task_for_layer(TaskRegistry const &, - layer_guid_t const &, - OpTaskType const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/task_registry.struct.toml b/lib/local-execution/include/local-execution/task_registry.struct.toml deleted file mode 100644 index f5daa62090..0000000000 --- a/lib/local-execution/include/local-execution/task_registry.struct.toml +++ /dev/null @@ -1,35 +0,0 @@ -namespace = "FlexFlow" -name = "TaskRegistry" -features = [ - "eq", - "fmt", - "hash" -] - -includes = [ - "task-spec/task_signature_impl.dtg.h", - "task-spec/task_id_t.dtg.h", - "pcg/layer_guid_t.dtg.h", -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/fmt/optional.h", -] - -[[fields]] -name = "init_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "forward_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "backward_task_ids" -type = "std::unordered_map<::FlexFlow::layer_guid_t, std::optional<::FlexFlow::task_id_t>>" - -[[fields]] -name = "task_mapping" -type = "std::unordered_map<::FlexFlow::task_id_t, ::FlexFlow::TaskSignatureAndImpl>" diff --git a/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml 
b/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml new file mode 100644 index 0000000000..434988fa21 --- /dev/null +++ b/lib/local-execution/include/local-execution/tensor_slot_backing.variant.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "TensorSlotBacking" +features = [ + "eq", + "fmt", +] + +includes = [ + "kernels/accessor.h", + "", +] + +src_includes = [ + "utils/fmt/vector.h", +] + +[[values]] +type = "::FlexFlow::GenericTensorAccessorW" +key = "single" + +[[values]] +type = "std::vector<::FlexFlow::GenericTensorAccessorW>" +key = "variadic" diff --git a/lib/local-execution/include/local-execution/tracked_allocator.h b/lib/local-execution/include/local-execution/tracked_allocator.h index f697337c52..0b531f9b3d 100644 --- a/lib/local-execution/include/local-execution/tracked_allocator.h +++ b/lib/local-execution/include/local-execution/tracked_allocator.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TRACKED_ALLOCATOR_H #include "kernels/allocation.h" +#include "utils/units/num_bytes_t.h" namespace FlexFlow { @@ -16,7 +17,7 @@ struct TrackedAllocator : public IAllocator { DeviceType get_allocation_device_type() const override; - size_t get_current_mem_usage(); + num_bytes_t get_current_mem_usage() const; private: size_t current_mem_usage = 0; diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.h b/lib/local-execution/include/local-execution/unallocated_tensors.h deleted file mode 100644 index 63ead67589..0000000000 --- a/lib/local-execution/include/local-execution/unallocated_tensors.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H -#define _FLEXFLOW_LOCAL_EXECUTION_UNALLOCATED_TENSORS_H - -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "pcg/tensor_attrs.dtg.h" - -namespace FlexFlow { - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &); - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &, - std::unordered_map const &, - GradientTensorSource &, - OptimizerTensorSource &, - OptimizerAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml b/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml deleted file mode 100644 index e86cc2a532..0000000000 --- a/lib/local-execution/include/local-execution/unallocated_tensors.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "UnallocatedTensors" -features = [ - "eq", - "fmt", - "hash", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "op-attrs/tensor_shape.dtg.h" -] - -src_includes = [ - "utils/hash/unordered_map.h", - "utils/fmt/unordered_map.h", - "utils/hash/vector.h", - "utils/fmt/vector.h" -] - -[[fields]] -name = "tensor_type_shapes" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::TensorShape>" - -[[fields]] -name = "gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" diff --git a/lib/local-execution/src/allocated_tensors.cc 
b/lib/local-execution/src/allocated_tensors.cc deleted file mode 100644 index ffaeaf285f..0000000000 --- a/lib/local-execution/src/allocated_tensors.cc +++ /dev/null @@ -1,145 +0,0 @@ -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/keys.h" -#include "utils/containers/set_union.h" - -namespace FlexFlow { - -bool is_allocated_tensor_backing_valid( - TensorTypeVariant const &tensor_type, - std::unordered_map const - &allocated_tensor_backings, - ArrayShape const &expected_shape) { - if (allocated_tensor_backings.count(tensor_type)) { - GenericTensorAccessorW tensor_backing = - allocated_tensor_backings.at(tensor_type); - if (expected_shape == tensor_backing.shape) { - return true; - } - } - return false; -}; - -bool are_allocated_forward_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - - std::unordered_set all_tensor_guids = transform( - keys(filter_keys( - allocated_tensors.tensor_type_backings, - [&](TensorTypeVariant const &k) { return k.has(); })), - [&](TensorTypeVariant const &t) { return t.get(); }); - - for (tensor_guid_t const &tensor_guid : all_tensor_guids) { - if (tensor_attrs.count(tensor_guid)) { - if (!is_allocated_tensor_backing_valid( - TensorTypeVariant{tensor_guid}, - allocated_tensors.tensor_type_backings, - array_shape_from_tensor_shape( - tensor_attrs.at(tensor_guid).shape))) { - return false; - } - } else { - return false; - } - } - return true; -} - -bool are_allocated_gradient_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling gradient tensors - - for (std::pair const &tensor_to_grad : - allocated_tensors.gradient_mapping) { - if (tensor_attrs.count(tensor_to_grad.first)) { - if (tensor_attrs.at(tensor_to_grad.first).create_grad == CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( - tensor_attrs.at(tensor_to_grad.first).shape); - TensorTypeVariant gradient_tensor = - TensorTypeVariant{tensor_to_grad.second}; - if (is_allocated_tensor_backing_valid( - gradient_tensor, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(gradient_tensor); - } else { - return false; - } - } else { - return false; - } - } - - for (TensorTypeVariant const &tensor_type : - keys(allocated_tensors.tensor_type_backings)) { - if (tensor_type.has()) { - if (!tensors_in_mappings.count(tensor_type)) { - return false; - } - } - } - return true; -} - -bool are_allocated_optimizer_tensors_valid( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs) { - std::unordered_set - tensors_in_mappings; // will check for dangling optimizer tensors - - for (std::pair> const - &tensor_to_optimizers : allocated_tensors.optimizer_mapping) { - if (tensor_attrs.count(tensor_to_optimizers.first)) { - if (tensor_attrs.at(tensor_to_optimizers.first).create_grad == - CreateGrad::NO) { - return false; - } - - ArrayShape tensor_guid_array_shape = array_shape_from_tensor_shape( - tensor_attrs.at(tensor_to_optimizers.first).shape); - for (optimizer_tensor_t const &optimizer_tensor : - tensor_to_optimizers.second) { - if (is_allocated_tensor_backing_valid( - TensorTypeVariant{optimizer_tensor}, - allocated_tensors.tensor_type_backings, - tensor_guid_array_shape)) { - tensors_in_mappings.insert(TensorTypeVariant{optimizer_tensor}); - } else { - 
return false;
-        }
-      }
-    }
-  }
-
-  for (TensorTypeVariant const &tensor_type :
-       keys(allocated_tensors.tensor_type_backings)) {
-    if (tensor_type.has<optimizer_tensor_t>()) {
-      if (!tensors_in_mappings.count(tensor_type)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-bool are_allocated_tensors_valid(
-    AllocatedTensors const &allocated_tensors,
-    std::unordered_map<tensor_guid_t, TensorAttrs> const &tensor_attrs) {
-  return are_allocated_forward_tensors_valid(allocated_tensors, tensor_attrs) &&
-         are_allocated_gradient_tensors_valid(allocated_tensors,
-                                              tensor_attrs) &&
-         are_allocated_optimizer_tensors_valid(allocated_tensors, tensor_attrs);
-}
-
-AllocatedTensors make_empty_allocated_tensors() {
-  return AllocatedTensors{{}, {}, {}};
-}
-
-} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/local_args_backing.cc b/lib/local-execution/src/local-execution/local_args_backing.cc
new file mode 100644
index 0000000000..a672b9d164
--- /dev/null
+++ b/lib/local-execution/src/local-execution/local_args_backing.cc
@@ -0,0 +1,62 @@
+#include "local-execution/local_args_backing.h"
+#include "local-execution/local_task_registry.h"
+#include "local-execution/local_tensor_backing.h"
+#include "op-attrs/parallel_tensor_shape.h"
+#include "task-spec/op_task_to_task_invocation.h"
+#include "task-spec/task_signature_impl.h"
+#include "task-spec/training_computation_graph.h"
+#include "task-spec/training_layer_plus_context.h"
+#include "utils/containers/contains_key.h"
+#include "utils/containers/generate_map.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/try_at.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+std::optional<DeviceSpecificDeviceStates> get_per_device_op_state_if_exists(
+    LocalArgsBacking const &local_args_backing,
+    layer_guid_t const &layer_guid) {
+
+  return local_args_backing.per_device_op_states.at(layer_guid);
+}
+
+std::unordered_map<slot_id_t, ConcreteArgSpec>
+    construct_arg_slots_backing(TaskBinding const &binding,
+                                RuntimeArgConfig const &runtime_arg_config) {
+  return map_values(
+      binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) {
+        return arg_binding.template visit<ConcreteArgSpec>(
+            overload{[&](RuntimeArgRefSpec const &s) {
+                       return lower_to_concrete_arg_spec(s, runtime_arg_config);
+                     },
+                     [](ConcreteArgSpec const &s) { return s; }});
+      });
+}
+
+TaskArgumentAccessor
+    get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing,
+                          RuntimeArgConfig const &runtime_arg_config,
+                          TaskInvocation const &invocation,
+                          Allocator &allocator) {
+  std::unordered_map<tensor_sub_slot_id_t, TensorSlotBacking>
+      tensor_slots_backing = construct_tensor_slots_backing_for_binding(
+          local_tensor_backing, invocation.binding);
+  std::unordered_map<slot_id_t, ConcreteArgSpec> arg_slots_backing =
+      construct_arg_slots_backing(invocation.binding, runtime_arg_config);
+  return TaskArgumentAccessor::create<LocalTaskArgumentAccessor>(
+      allocator, tensor_slots_backing, arg_slots_backing);
+}
+
+LocalArgsBacking make_local_args_backing_for_computation_graph(
+    RuntimeArgConfig const &runtime_arg_config,
+    std::unordered_map<layer_guid_t,
+                       std::optional<DeviceSpecificDeviceStates>> const
+        &per_device_op_states) {
+  return LocalArgsBacking{
+      runtime_arg_config,
+      per_device_op_states,
+  };
+}
+
+} // namespace FlexFlow
diff --git a/lib/local-execution/src/local-execution/local_cost_estimator.cc b/lib/local-execution/src/local-execution/local_cost_estimator.cc
new file mode 100644
index 0000000000..6517dbfdbc
--- /dev/null
+++ b/lib/local-execution/src/local-execution/local_cost_estimator.cc
@@ -0,0 +1,166 @@
+#include "local-execution/local_cost_estimator.h"
+#include "kernels/create_local_allocator_for_device_type.h"
+#include "kernels/device.h"
+#include "kernels/local_cpu_allocator.h"
"kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "local-execution/local_training_backing.h" +#include "local-execution/tracked_allocator.h" +#include "op-attrs/computation_graph_op_attrs.h" +#include "op-attrs/pcg_operator_attrs.h" +#include "pcg/computation_graph.h" +#include "pcg/computation_graph/layer_added_result.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/parallel_tensor_attrs.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/concat_vectors.h" +#include "utils/containers/get_only.h" +#include "utils/containers/sum.h" +#include "utils/containers/transform.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) + : runtime_arg_config(config) {} + +static TrainingComputationGraph + create_computation_graph_for_local_cost_estimation( + PCGOperatorAttrs const &op, + OptimizerAttrs const &optimizer_attrs, + std::vector const &inputs, + std::vector const &weights, + std::vector const &outputs) { + ComputationGraph computation_graph = make_empty_computation_graph(); + + std::vector input_tensors; + for (ParallelTensorShape const &input : inputs) { + LayerAddedResult inputs_layer = add_layer( + computation_graph, + LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, + std::nullopt}, + {}, + {}); + input_tensors.push_back(get_only(inputs_layer.outputs)); + } + + std::vector weight_tensors; + for (ParallelTensorShape const &weight : weights) { + LayerAddedResult weights_layer = + add_layer(computation_graph, + LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ + get_piece_shape(weight), + InitializerAttrs{ZeroInitializerAttrs{}}}}, + std::nullopt}, + {}, + {}); + weight_tensors.push_back(get_only(weights_layer.outputs)); + } + + // create operator layer + LayerAddedResult operator_layer = add_layer( + computation_graph, + LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, + input_tensors, + weight_tensors); + + ForwardTensorSource forward_tensor_source; + GradientTensorSource gradient_tensor_source; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + TrainingComputationGraph training_cg = generate_training_computation_graph( + /*computation_graph=*/computation_graph, + /*optimizer_attrs=*/optimizer_attrs, + /*logit_tensor=*/operator_layer.outputs.at(0), + /*forward_tensor_source=*/forward_tensor_source, + /*gradient_tensor_source=*/gradient_tensor_source, + /*optimizer_tensor_source=*/optimizer_tensor_source, + /*loss_tensor_source=*/loss_tensor_source); + + return training_cg; +} + +OpCostMetrics LocalCostEstimator::estimate_cost( + OpCostEstimateKey const &op_cost_estimate_key) const { + + PCGOperatorAttrs op = op_cost_estimate_key.op_attrs; + std::vector inputs = op_cost_estimate_key.input_shapes; + std::vector weights = op_cost_estimate_key.weight_shapes; + std::vector outputs = op_cost_estimate_key.output_shapes; + MachineView mv = op_cost_estimate_key.machine_view; + + if (is_parallel_op(op) || op.has() || op.has() || + op.has()) { + return OpCostMetrics{ + /*forward_runtime=*/0_ms, + /*backward_runtime=*/0_ms, + /*memory=*/0_bytes, + }; + } + + TrainingComputationGraph training_cg = + create_computation_graph_for_local_cost_estimation( + /*op=*/op, + /*optimizer_attrs=*/op_cost_estimate_key.optimizer_attrs, + 
/*inputs=*/inputs, + /*weights=*/weights, + /*outputs=*/outputs); + + // allocate memory + std::shared_ptr tracked_allocator_ptr = + std::make_shared(create_local_allocator_for_device_type( + runtime_arg_config.kernel_device_type)); + Allocator allocator = Allocator(tracked_allocator_ptr); + + LocalTrainingBacking local_backing = + make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/{}, + /*training_computation_graph=*/training_cg, + /*runtime_arg_config=*/this->runtime_arg_config, + /*optimizer_attrs=*/op_cost_estimate_key.optimizer_attrs); + + // execute layer + layer_guid_t operator_layer_guid = + get_layer_by_name(training_cg.computation_graph, "operator"); + + milliseconds_t fwd = execute_forward(local_backing.local_task_registry, + local_backing.local_tensor_backing, + local_backing.local_args_backing, + get_training_layer_plus_context( + training_cg, operator_layer_guid), + allocator) + .value(); + milliseconds_t bwd = execute_backward(local_backing.local_task_registry, + local_backing.local_tensor_backing, + local_backing.local_args_backing, + get_training_layer_plus_context( + training_cg, operator_layer_guid), + allocator) + .value(); + + return OpCostMetrics{ + /*forward_runtime=*/fwd, + /*backward_runtime=*/bwd, + /*memory=*/tracked_allocator_ptr->get_current_mem_usage(), + }; +} + +milliseconds_t LocalCostEstimator::estimate_cost( + TensorSetMovement const &tensor_set_movement) const { + // TODO: model communication cost analytically + // https://github.com/flexflow/FlexFlow/issues/1414 + + NOT_IMPLEMENTED(); +} + +CostEstimator + get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) { + return CostEstimator::create(runtime_arg_config); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_task_registry.cc b/lib/local-execution/src/local-execution/local_task_registry.cc new file mode 100644 index 0000000000..d482736a5b --- /dev/null +++ b/lib/local-execution/src/local-execution/local_task_registry.cc @@ -0,0 +1,64 @@ +#include "local-execution/local_task_registry.h" +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "pcg/computation_graph.h" +#include "task-spec/task_signature_impl.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/map_values.h" +#include "utils/containers/try_at.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +LocalTaskRegistry construct_local_task_registry_for_layers( + std::unordered_map const &layer_attrs_mapping) { + + std::unordered_map task_sets = + map_values(layer_attrs_mapping, [](LayerAttrs const &layer_attrs) { + return get_task_set_for_operator(layer_attrs.op_attrs); + }); + + std::unordered_set all_tasks = + flatmap(unordered_set_of(values(task_sets)), get_all_tasks_in_task_set); + + std::unordered_set all_real_tasks = + filtrans(all_tasks, [](registered_task_t const &t) { + return t.try_require_real_task(); + }); + + std::unordered_map task_mapping = + generate_map(all_real_tasks, get_task_signature_and_impl_for_task_id); + + return LocalTaskRegistry{ + /*task_sets=*/task_sets, + /*task_mapping=*/task_mapping, + }; +} + +std::optional + try_get_registered_task(LocalTaskRegistry const &task_registry, + layer_guid_t const &layer_guid, + OpTaskType const &op_task_type) { + if (!contains_key(task_registry.task_sets, layer_guid)) { + 
return std::nullopt; + } + + return get_task_for_task_type(task_registry.task_sets.at(layer_guid), + op_task_type); +} + +std::optional + call_task_impl(LocalTaskRegistry const &task_registry, + task_id_t const &task_id, + TaskArgumentAccessor const &acc) { + TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); + auto fn = + task_sig_impl.impl_function.get().function_ptr; + return transform( + fn(acc), [](float running_time) { return milliseconds_t{running_time}; }); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_tensor_backing.cc b/lib/local-execution/src/local-execution/local_tensor_backing.cc new file mode 100644 index 0000000000..be8e44736c --- /dev/null +++ b/lib/local-execution/src/local-execution/local_tensor_backing.cc @@ -0,0 +1,74 @@ +#include "local-execution/local_tensor_backing.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/is_submapeq_of.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/map_values.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/set_of.h" +#include "utils/overload.h" + +namespace FlexFlow { + +LocalTensorBacking construct_local_tensor_backing( + std::unordered_map const + &training_tensor_shapes, + std::unordered_map const + &preallocated, + Allocator &allocator) { + + ASSERT(is_subseteq_of(keys(preallocated), keys(training_tensor_shapes))); + + std::unordered_set to_allocate = + set_minus(keys(training_tensor_shapes), keys(preallocated)); + + std::unordered_map allocated = + generate_map(to_allocate, [&](training_tensor_guid_t t) { + TensorShape shape = training_tensor_shapes.at(t); + return allocator.allocate_tensor(shape); + }); + + std::unordered_map + backing_for_training_tensor_map = + merge_disjoint_maps(allocated, preallocated); + + ASSERT(is_submapeq_of(preallocated, backing_for_training_tensor_map)); + + ASSERT(keys(backing_for_training_tensor_map) == keys(training_tensor_shapes), + backing_for_training_tensor_map.size(), + training_tensor_shapes.size(), + keys(preallocated)); + + return LocalTensorBacking{ + backing_for_training_tensor_map, + }; +} + +GenericTensorAccessorW get_accessor_for_training_tensor( + LocalTensorBacking const &local_tensor_backing, + training_tensor_guid_t training_tensor) { + return local_tensor_backing.backing_for_training_tensor_map.at( + training_tensor); +} + +std::unordered_map + construct_tensor_slots_backing_for_binding( + LocalTensorBacking const &local_tensor_backing, + TaskBinding const &binding) { + + return map_values( + binding.get_tensor_bindings(), [&](training_tensor_guid_t t) { + return TensorSlotBacking{ + get_accessor_for_training_tensor(local_tensor_backing, t), + }; + }); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/local_training_backing.cc b/lib/local-execution/src/local-execution/local_training_backing.cc new file mode 100644 index 0000000000..9c67d3acd3 --- /dev/null +++ b/lib/local-execution/src/local-execution/local_training_backing.cc @@ -0,0 +1,221 @@ +#include "local-execution/local_training_backing.h" +#include "local-execution/local_args_backing.h" +#include "pcg/computation_graph.h" 
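+// Usage sketch (illustrative only; `allocator`, `training_cg`, `loss_attrs`,
+// `runtime_arg_config`, and `optimizer_attrs` are placeholder names, not part
+// of this file): the functions below are typically driven as
+//
+//   LocalTrainingBacking backing =
+//       make_local_training_backing_for_computation_graph(
+//           allocator, /*preallocated_tensors=*/{}, training_cg,
+//           runtime_arg_config, optimizer_attrs);
+//   compute_loss(backing, loss_attrs, allocator);
+//
+// with execute_forward/execute_backward invoked per layer by the
+// ModelTrainingInstance wrapper.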
+#include "pcg/optimizer_attrs.h" +#include "task-spec/loss_functions.h" +#include "task-spec/op_task_to_task_invocation.h" +#include "task-spec/optimizer.h" +#include "task-spec/task_invocation.h" +#include "task-spec/task_signature_impl.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/contains.h" +#include "utils/containers/contains_key.h" +#include "utils/containers/get_only.h" +#include "utils/containers/is_subseteq_of.h" +#include "utils/containers/keys.h" +#include "utils/containers/values.h" +#include "utils/exception.h" + +namespace FlexFlow { + +LocalTrainingBacking make_local_training_backing_for_computation_graph( + Allocator &allocator, + std::unordered_map const + &preallocated, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs) { + + ASSERT(is_subseteq_of( + keys(preallocated), + keys(get_all_training_tensor_shapes(training_computation_graph)))); + + LocalTaskRegistry local_task_registry = + construct_local_task_registry_for_layers(get_layer_attrs_mapping( + training_computation_graph.computation_graph)); + + LocalTensorBacking local_tensor_backing = construct_local_tensor_backing( + get_all_training_tensor_shapes(training_computation_graph), + preallocated, + allocator); + + std::unordered_map> + per_device_op_states = generate_map( + topological_ordering(training_computation_graph.computation_graph), + [&](layer_guid_t const &layer_guid) { + return create_per_device_op_state( + local_task_registry, + local_tensor_backing, + runtime_arg_config, + allocator, + get_training_layer_plus_context(training_computation_graph, + layer_guid)); + }); + + LocalArgsBacking local_args_backing = + make_local_args_backing_for_computation_graph(runtime_arg_config, + per_device_op_states); + + return LocalTrainingBacking{ + /*computation_graph=*/training_computation_graph, + /*local_task_registry=*/local_task_registry, + /*local_tensor_backing=*/local_tensor_backing, + /*local_args_backing=*/local_args_backing, + }; +} + +std::optional + create_per_device_op_state(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &tensor_backing, + RuntimeArgConfig const &runtime_arg_config, + Allocator &allocator, + TrainingLayerPlusContext const &training_layer) { + std::optional maybe_registered_task = try_get_registered_task( + local_task_registry, training_layer.layer_guid, OpTaskType::INIT); + + ASSERT(maybe_registered_task.has_value()); + + registered_task_t registered_task = maybe_registered_task.value(); + if (registered_task.is_noop_task()) { + return std::nullopt; + } + + TaskInvocation invocation = lower_to_task_invocation( + /*op_task_invocation=*/get_init_op_task_invocation( + training_layer.layer_attrs.op_attrs), + /*training_layer=*/training_layer, + /*device_specific_device_states=*/std::nullopt); + + TaskArgumentAccessor accessor = get_task_arg_accessor( + tensor_backing, runtime_arg_config, invocation, allocator); + TaskSignatureAndImpl task_sig_impl = + local_task_registry.task_mapping.at(invocation.task_id); + auto fn = + task_sig_impl.impl_function.get().function_ptr; + DeviceSpecificDeviceStates device_state = fn(accessor); + return device_state; +} + +std::optional + execute_forward(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &local_tensor_backing, + LocalArgsBacking const &local_args_backing, + TrainingLayerPlusContext const &training_layer, + Allocator &allocator) { + + std::optional 
maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::FWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_forward_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/device_state);
+
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            allocator);
+  return call_task_impl(local_task_registry, invocation.task_id, accessor);
+}
+
+void compute_loss(LocalTrainingBacking const &local_training_backing,
+                  LossAttrs const &loss_attrs,
+                  Allocator &allocator) {
+
+  TrainingComputationGraph training_cg =
+      local_training_backing.training_computation_graph;
+  tensor_guid_t logit_tensor = training_cg.logit_tensor;
+  loss_tensor_guid_t label_tensor = training_cg.label_tensor;
+
+  TaskInvocation loss_invocation = backward(
+      loss_attrs,
+      get_forward_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      get_gradient_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      label_tensor);
+  // TODO: https://github.com/flexflow/flexflow-train/issues/1442
+  // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
+  TaskArgumentAccessor loss_accessor = get_task_arg_accessor(
+      local_training_backing.local_tensor_backing,
+      local_training_backing.local_args_backing.runtime_arg_config,
+      loss_invocation,
+      allocator);
+  TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+  loss_impl_fn.get<GenericTaskImplFunction>().function_ptr(loss_accessor);
+}
+
+std::optional<milliseconds_t>
+    execute_backward(LocalTaskRegistry const &local_task_registry,
+                     LocalTensorBacking const &local_tensor_backing,
+                     LocalArgsBacking const &local_args_backing,
+                     TrainingLayerPlusContext const &training_layer,
+                     Allocator &allocator) {
+
+  std::optional<registered_task_t> maybe_registered_task =
+      try_get_registered_task(
+          local_task_registry, training_layer.layer_guid, OpTaskType::BWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+  TaskInvocation invocation = lower_to_task_invocation(
+      get_backward_op_task_invocation(training_layer.layer_attrs.op_attrs),
+      training_layer,
+      device_state);
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            allocator);
+  return call_task_impl(local_task_registry, invocation.task_id, accessor);
+}
+
+void execute_update(LocalTrainingBacking const &local_training_backing,
+                    layer_guid_t const &layer_guid,
+                    OptimizerAttrs const &optimizer_attrs,
+                    Allocator &allocator) {
+  TrainingLayerPlusContext training_layer = get_training_layer_plus_context(
+      local_training_backing.training_computation_graph, layer_guid);
+
+  if (training_layer.layer_attrs.op_attrs.has<WeightAttrs>()) {
+    TrainingTensorGroupWithAttrs weight_tensor_group =
+        get_only(training_layer.output_tensor_groups);
+
+    TaskInvocation invocation =
+        get_update_invocation(optimizer_attrs,
+
weight_tensor_group.forward_tensor, + weight_tensor_group.gradient_tensor, + weight_tensor_group.optimizer_tensors); + + // TODO: https://github.com/flexflow/flexflow-train/issues/1442 + // assert(is_invocation_valid(get_update_signature(attrs), invocation)); + + TaskArgumentAccessor accessor = get_task_arg_accessor( + local_training_backing.local_tensor_backing, + local_training_backing.local_args_backing.runtime_arg_config, + invocation, + allocator); + TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); + update_impl_fn.get().function_ptr(accessor); + } +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/model_training_instance.cc b/lib/local-execution/src/local-execution/model_training_instance.cc new file mode 100644 index 0000000000..be2791a365 --- /dev/null +++ b/lib/local-execution/src/local-execution/model_training_instance.cc @@ -0,0 +1,85 @@ +#include "local-execution/model_training_instance.h" +#include "pcg/computation_graph.h" +#include "pcg/optimizer_attrs.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/reversed.h" + +namespace FlexFlow { + +ModelTrainingInstance::ModelTrainingInstance( + Allocator const &allocator, + LocalTrainingBacking const &local_training_backing, + LossAttrs const &loss_attrs, + OptimizerAttrs const &optimizer_attrs) + : allocator(allocator), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs) {} + +std::unordered_map> + ModelTrainingInstance::forward() { + + std::unordered_map> + per_layer_elapsed_time; + + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { + std::optional elapsed_time = execute_forward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->allocator); + + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); + } + + return per_layer_elapsed_time; +} + +std::unordered_map> + ModelTrainingInstance::backward() { + compute_loss(this->training_backing, this->loss_attrs, this->allocator); + + std::unordered_map> + per_layer_elapsed_time; + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { + std::optional elapsed_time = execute_backward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->allocator); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); + } + return per_layer_elapsed_time; +} + +void ModelTrainingInstance::update() { + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { + execute_update(this->training_backing, + layer_guid, + this->optimizer_attrs, + this->allocator); + } + this->optimizer_attrs = + get_optimizer_attrs_for_next_iter(this->optimizer_attrs); +} + +GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { + gradient_tensor_guid_t loss_tensor = get_gradient_tensor_guid_for_tensor_guid( + this->training_backing.training_computation_graph, + this->training_backing.training_computation_graph.logit_tensor); 
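+  // Note: the loss backward task (see compute_loss in
+  // local_training_backing.cc) writes the loss gradient into the gradient
+  // tensor of the logit tensor, so the accessor returned below is a
+  // read-only view of that gradient tensor's backing.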
+ GenericTensorAccessorW loss_tensor_backing = + this->training_backing.local_tensor_backing + .backing_for_training_tensor_map.at( + training_tensor_guid_t{loss_tensor}); + return read_only_accessor_from_write_accessor(loss_tensor_backing); +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/operator_task_set.cc b/lib/local-execution/src/local-execution/operator_task_set.cc new file mode 100644 index 0000000000..8dbc8791c6 --- /dev/null +++ b/lib/local-execution/src/local-execution/operator_task_set.cc @@ -0,0 +1,71 @@ +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "task-spec/task_signature_impl.h" +#include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +bidict + get_map_from_task_type_to_task(OperatorTaskSet const &op_task_set) { + return { + {OpTaskType::INIT, op_task_set.init_task}, + {OpTaskType::FWD, op_task_set.fwd_task}, + {OpTaskType::BWD, op_task_set.bwd_task}, + }; +} + +std::unordered_set + get_all_tasks_in_task_set(OperatorTaskSet const &op_task_set) { + return right_entries(get_map_from_task_type_to_task(op_task_set)); +} + +registered_task_t get_task_for_task_type(OperatorTaskSet const &op_task_set, + OpTaskType task_type) { + return get_map_from_task_type_to_task(op_task_set).at_l(task_type); +} + +OperatorTaskSet + get_task_set_for_operator(ComputationGraphOpAttrs const &attrs) { + registered_task_t init_task = make_noop_registered_task(); + registered_task_t fwd_task = make_noop_registered_task(); + registered_task_t bwd_task = make_noop_registered_task(); + + std::vector task_ids = get_task_ids(attrs); + + for (task_id_t const &task_id : task_ids) { + TaskSignatureAndImpl task_signature_and_impl = + get_task_signature_and_impl_for_task_id(task_id); + + TaskImplFunction task_impl_function = task_signature_and_impl.impl_function; + OpTaskSignature task_signature = task_signature_and_impl.task_signature; + + switch (task_signature.type) { + case OpTaskType::INIT: + ASSERT(is_invocation_valid(task_signature, + get_init_op_task_invocation(attrs))); + init_task = registered_task_t{task_id}; + break; + case OpTaskType::FWD: + ASSERT(is_invocation_valid(task_signature, + get_forward_op_task_invocation(attrs))); + fwd_task = registered_task_t{task_id}; + break; + case OpTaskType::BWD: + ASSERT(is_invocation_valid(task_signature, + get_backward_op_task_invocation(attrs))); + bwd_task = registered_task_t{task_id}; + break; + default: + PANIC("Unhandled OpTaskType", fmt::to_string(task_signature.type)); + } + } + + return OperatorTaskSet{ + /*init_task=*/init_task, + /*fwd_task=*/fwd_task, + /*bwd_task=*/bwd_task, + }; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local-execution/registered_task.cc b/lib/local-execution/src/local-execution/registered_task.cc new file mode 100644 index 0000000000..84b116273a --- /dev/null +++ b/lib/local-execution/src/local-execution/registered_task.cc @@ -0,0 +1,9 @@ +#include "local-execution/registered_task.h" + +namespace FlexFlow { + +registered_task_t make_noop_registered_task() { + return registered_task_t{std::monostate{}}; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/local_args_backing.cc b/lib/local-execution/src/local_args_backing.cc deleted file mode 100644 index 4a342767b2..0000000000 --- a/lib/local-execution/src/local_args_backing.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "local-execution/local_args_backing.h" -#include 
"op-attrs/parallel_tensor_shape.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -LocalArgsBacking make_args_backing_with_empty_device_states( - RuntimeArgConfig const &runtime_arg_config) { - return LocalArgsBacking{runtime_arg_config, {}}; -} - -LocalArgsBacking::LocalArgsBacking( - RuntimeArgConfig const &runtime_arg_config, - std::unordered_map const - &device_states) - : runtime_arg_config(runtime_arg_config), - per_device_op_states(device_states){}; - -std::optional get_per_device_op_state_if_exists( - LocalArgsBacking const &local_args_backing, - layer_guid_t const &layer_guid) { - if (contains_key(local_args_backing.per_device_op_states, layer_guid)) { - return local_args_backing.per_device_op_states.at(layer_guid); - } else { - return std::nullopt; - } -} - -ArgSlotsBacking - construct_arg_slots_backing(TaskBinding const &binding, - RuntimeArgConfig const &runtime_arg_config) { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return lower_to_concrete_arg_spec(s, runtime_arg_config); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_cost_estimator.cc b/lib/local-execution/src/local_cost_estimator.cc deleted file mode 100644 index 85f315c7d1..0000000000 --- a/lib/local-execution/src/local_cost_estimator.cc +++ /dev/null @@ -1,122 +0,0 @@ -#include "local-execution/local_cost_estimator.h" -#include "kernels/device.h" -#include "kernels/local_cuda_allocator.h" -#include "local-execution/tracked_allocator.h" -#include "op-attrs/computation_graph_op_attrs.h" -#include "op-attrs/pcg_operator_attrs.h" -#include "pcg/computation_graph.h" -#include "pcg/computation_graph/layer_added_result.dtg.h" -#include "pcg/machine_view.dtg.h" -#include "pcg/parallel_tensor_attrs.h" -#include "utils/containers/concat_vectors.h" -#include "utils/containers/get_only.h" -#include "utils/containers/sum.h" -#include "utils/containers/transform.h" -#include "utils/containers/values.h" - -namespace FlexFlow { - -LocalCostEstimator::LocalCostEstimator(RuntimeArgConfig const &config) - : runtime_arg_config(config) {} - -static ComputationGraph create_computation_graph_for_local_cost_estimation( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs) { - ComputationGraph computation_graph = make_empty_computation_graph(); - - std::vector input_tensors; - for (ParallelTensorShape const &input : inputs) { - LayerAddedResult inputs_layer = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{InputAttrs{get_piece_shape(input)}}, - std::nullopt}, - {}, - {}); - input_tensors.push_back(get_only(inputs_layer.outputs)); - } - - std::vector weight_tensors; - for (ParallelTensorAttrs const &weight : weights) { - LayerAddedResult weights_layer = - add_layer(computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - get_piece_shape(weight.shape), - InitializerAttrs{ZeroInitializerAttrs{}}}}, - std::nullopt}, - {}, - {}); - weight_tensors.push_back(get_only(weights_layer.outputs)); - } - - // create operator layer - LayerAddedResult operator_layer = add_layer( - computation_graph, - LayerAttrs{compgraph_op_attrs_from_pcg_op_attrs(op), "operator"}, - input_tensors, - weight_tensors); - - 
return computation_graph; -} - -CostDetails LocalCostEstimator::estimate_cost( - PCGOperatorAttrs const &op, - std::vector const &inputs, - std::vector const &weights, - std::vector const &outputs, - MachineView const &mv) const { - - if (is_parallel_op(op) || op.has() || op.has() || - op.has()) { - return CostDetails{0, 0}; - } - - // construct computation graph - ComputationGraph computation_graph = - create_computation_graph_for_local_cost_estimation( - op, inputs, weights, outputs); - - // allocate memory - std::shared_ptr tracked_allocator_ptr = - std::make_shared(create_local_cuda_memory_allocator()); - Allocator allocator = Allocator(tracked_allocator_ptr); - - GradientTensorSource gradient_tensor_source; - - LocalTrainingBacking local_backing(allocator, - AllocatedTensors{{}, {}, {}}, - gradient_tensor_source, - computation_graph, - this->runtime_arg_config); - // execute layer - layer_guid_t operator_layer_guid = - get_layer_by_name(computation_graph, "operator"); - - float fwd = - execute_forward(local_backing, operator_layer_guid, allocator).value(); - float bwd = - execute_backward(local_backing, operator_layer_guid, allocator).value(); - - float total_execution_time = fwd + bwd; - - return CostDetails{total_execution_time, - tracked_allocator_ptr->get_current_mem_usage()}; -} - -float LocalCostEstimator::estimate_cost(ParallelTensorShape const &tensor_shape, - MachineView const &src, - MachineView const &dst) const { - // TODO: model communication cost analytically - // https://github.com/flexflow/FlexFlow/issues/1414 - // temporarily return 0 - - return 0.0; -} - -CostEstimator - get_local_cost_estimator(RuntimeArgConfig const &runtime_arg_config) { - return CostEstimator::create(runtime_arg_config); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_task_argument_accessor.cc b/lib/local-execution/src/local_task_argument_accessor.cc index 2e82378fdb..207305a8db 100644 --- a/lib/local-execution/src/local_task_argument_accessor.cc +++ b/lib/local-execution/src/local_task_argument_accessor.cc @@ -8,8 +8,9 @@ namespace FlexFlow { LocalTaskArgumentAccessor::LocalTaskArgumentAccessor( Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing) + std::unordered_map const + &tensor_slots_backing, + std::unordered_map const &arg_slots_backing) : allocator(allocator), tensor_slots_backing(tensor_slots_backing), arg_slots_backing(arg_slots_backing){}; @@ -20,9 +21,10 @@ ConcreteArgSpec const & GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( slot_id_t slot, Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_tensor_type)); + tensor_sub_slot_id_t slot_tensor_type = + tensor_sub_slot_id_t{slot, tensor_type}; + GenericTensorAccessorW tensor_backing = + this->tensor_slots_backing.at(slot_tensor_type).require_single(); if (priv == Permissions::RO) { GenericTensorAccessorR readonly_tensor_backing = read_only_accessor_from_write_accessor(tensor_backing); @@ -30,15 +32,16 @@ GenericTensorAccessor LocalTaskArgumentAccessor::get_tensor( } else if (priv == Permissions::RW || priv == Permissions::WO) { return tensor_backing; } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); + PANIC(fmt::format("Unhandled privilege mode {}", priv)); } } VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( slot_id_t slot, 
Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_tensor_type)); + tensor_sub_slot_id_t slot_tensor_type = + tensor_sub_slot_id_t{slot, tensor_type}; + std::vector variadic_tensor_backing = + this->tensor_slots_backing.at(slot_tensor_type).require_variadic(); if (priv == Permissions::RO) { std::vector readonly_variadic_tensor_backing = {}; for (GenericTensorAccessorW const &tensor_backing : @@ -50,7 +53,7 @@ VariadicGenericTensorAccessor LocalTaskArgumentAccessor::get_variadic_tensor( } else if (priv == Permissions::RW || priv == Permissions::WO) { return variadic_tensor_backing; } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); + PANIC(fmt::format("Unhandled privilege mode {}", priv)); } } diff --git a/lib/local-execution/src/local_tensor_backing.cc b/lib/local-execution/src/local_tensor_backing.cc deleted file mode 100644 index 629117508f..0000000000 --- a/lib/local-execution/src/local_tensor_backing.cc +++ /dev/null @@ -1,95 +0,0 @@ -#include "local-execution/local_tensor_backing.h" -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -GenericTensorAccessorW - get_tensor(LocalTensorBacking const &local_tensor_backing, - TensorTypeVariant const &tensor_type) { - return local_tensor_backing.tensor_backings.at(tensor_type); -} - -std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated) { - std::unordered_map> - merged_maps = allocated; - for (std::pair> const - &unallocated_optimizer_tensors : unallocated) { - if (merged_maps.count(unallocated_optimizer_tensors.first)) { - for (optimizer_tensor_t const &optimizer_tensor : - unallocated_optimizer_tensors.second) { - merged_maps[unallocated_optimizer_tensors.first].push_back( - optimizer_tensor); - } - } else { - merged_maps.insert({unallocated_optimizer_tensors}); - } - } - return merged_maps; -} - -std::unordered_map - get_tensor_backings( - std::unordered_map const - &tensor_type_backings, - std::unordered_map const - &tensor_type_shapes, - Allocator &allocator) { - std::unordered_map - all_tensor_backings = tensor_type_backings; - - // allocate new tensors - for (std::pair const &tensor_type_shape : - tensor_type_shapes) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_type_shape.second); - all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); - } - - return all_tensor_backings; -} - -LocalTensorBacking construct_local_tensor_backing( - AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator &allocator) { - - std::unordered_map merged_gradient_maps = - allocated_tensors.gradient_mapping; - merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); - - return LocalTensorBacking{ - get_tensor_backings(allocated_tensors.tensor_type_backings, - unallocated_tensors.tensor_type_shapes, - allocator), - merged_gradient_maps, - merge_optimizer_mappings(allocated_tensors.optimizer_mapping, - unallocated_tensors.optimizer_mapping)}; -} - -TensorSlotsBacking construct_tensor_slots_backing( - 
LocalTensorBacking const &local_tensor_backing, - TaskBinding const &binding) { - TensorSlotsBacking mapping; - - for (std::pair const &tensor_binding : - binding.get_tensor_bindings()) { - mapping.insert({tensor_binding.first, - get_tensor(local_tensor_backing, tensor_binding.second)}); - } - - return mapping; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/local_training_backing.cc b/lib/local-execution/src/local_training_backing.cc deleted file mode 100644 index 3b1bb0fd2d..0000000000 --- a/lib/local-execution/src/local_training_backing.cc +++ /dev/null @@ -1,264 +0,0 @@ -#include "local-execution/local_training_backing.h" -#include "local-execution/loss_functions.h" -#include "local-execution/optimizer.h" -#include "local-execution/unallocated_tensors.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "task-spec/task_invocation.h" -#include "task-spec/task_signature_impl.h" -#include "utils/containers/contains.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/get_only.h" -#include "utils/containers/values.h" -#include "utils/exception.h" - -namespace FlexFlow { - -LocalTrainingBacking::LocalTrainingBacking( - Allocator &allocator, - AllocatedTensors const &allocated_tensors, - GradientTensorSource &gradient_tensor_source, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config) - : computation_graph(computation_graph), - task_registry( - construct_task_registry(get_layer_attrs_mapping(computation_graph))), - local_tensor_backing(construct_local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors(allocated_tensors, - get_all_tensor_attrs(computation_graph), - gradient_tensor_source), - allocator)), - local_args_backing(initialize_args_backing(this->task_registry, - computation_graph, - runtime_arg_config, - this->local_tensor_backing, - allocator)){}; - -LocalTrainingBacking::LocalTrainingBacking( - Allocator &allocator, - AllocatedTensors const &allocated_tensors, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - ComputationGraph const &computation_graph, - RuntimeArgConfig const &runtime_arg_config, - OptimizerAttrs const &optimizer_attrs) - : computation_graph(computation_graph), - task_registry( - construct_task_registry(get_layer_attrs_mapping(computation_graph))), - local_tensor_backing(construct_local_tensor_backing( - allocated_tensors, - generate_unallocated_tensors_with_optimizer( - allocated_tensors, - get_all_tensor_attrs(computation_graph), - gradient_tensor_source, - optimizer_tensor_source, - optimizer_attrs), - allocator)), - local_args_backing(initialize_args_backing(this->task_registry, - computation_graph, - runtime_arg_config, - this->local_tensor_backing, - allocator)){}; -LocalArgsBacking - initialize_args_backing(TaskRegistry const &task_registry, - ComputationGraph const &cg, - RuntimeArgConfig const &runtime_arg_config, - LocalTensorBacking const &local_tensor_backing, - Allocator &allocator) { - std::unordered_map - per_device_op_states; - for (layer_guid_t const &node : topological_ordering(cg)) { - if (registry_contains_task_for_layer( - task_registry, node, OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; - - TaskInvocation invocation = - lower_to_task_invocation(init(attrs), - node, - get_incoming_inputs(cg, node), - get_incoming_input_shapes(cg, node), - get_outgoing_tensors(cg, node), - 
get_incoming_weights(cg, node), - local_tensor_backing.tensor_gradient_mapping, - std::nullopt); - TaskArgumentAccessor accessor = get_task_arg_accessor( - local_tensor_backing, - make_args_backing_with_empty_device_states(runtime_arg_config), - invocation, - allocator); - TaskSignatureAndImpl task_sig_impl = - task_registry.task_mapping.at(invocation.task_id); - auto fn = task_sig_impl.impl_function.get() - .function_ptr; - DeviceSpecificDeviceStates device_state = fn(accessor); - per_device_op_states.insert({node, device_state}); - } - } - - return LocalArgsBacking{runtime_arg_config, per_device_op_states}; -} - -std::optional call_task_impl(TaskRegistry const &task_registry, - task_id_t const &task_id, - TaskArgumentAccessor const &acc) { - TaskSignatureAndImpl task_sig_impl = task_registry.task_mapping.at(task_id); - auto fn = - task_sig_impl.impl_function.get().function_ptr; - return fn(acc); -} - -std::optional - execute_forward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::FWD)) { - - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .op_attrs; - - std::optional device_state = - get_per_device_op_state_if_exists( - local_training_backing.local_args_backing, operator_node); - - TaskInvocation invocation = lower_to_task_invocation( - forward(attrs), - operator_node, - get_incoming_inputs(local_training_backing.computation_graph, - operator_node), - get_incoming_input_shapes(local_training_backing.computation_graph, - operator_node), - get_outgoing_tensors(local_training_backing.computation_graph, - operator_node), - get_incoming_weights(local_training_backing.computation_graph, - operator_node), - local_training_backing.local_tensor_backing.tensor_gradient_mapping, - device_state); - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - return call_task_impl( - local_training_backing.task_registry, invocation.task_id, accessor); - } else { - return std::nullopt; - } -} - -void compute_loss(LocalTrainingBacking const &local_training_backing, - LossAttrs const &loss_attrs, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - Allocator &allocator) { - TaskInvocation loss_invocation = backward( - loss_attrs, - logit_tensor, - local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( - logit_tensor), - label_tensor); - // TODO: https://github.com/flexflow/flexflow-train/issues/1442 - // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation)); - TaskArgumentAccessor loss_accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - loss_invocation, - allocator); - TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl(); - loss_impl_fn.get().function_ptr(loss_accessor); -} - -std::optional - execute_backward(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &operator_node, - Allocator &allocator) { - if (registry_contains_task_for_layer(local_training_backing.task_registry, - operator_node, - OpTaskType::BWD)) { - ComputationGraphOpAttrs attrs = - get_layer_attrs(local_training_backing.computation_graph, operator_node) - .op_attrs; - - std::optional device_state = - get_per_device_op_state_if_exists( - 
local_training_backing.local_args_backing, operator_node); - TaskInvocation invocation = lower_to_task_invocation( - backward(attrs), - operator_node, - get_incoming_inputs(local_training_backing.computation_graph, - operator_node), - get_incoming_input_shapes(local_training_backing.computation_graph, - operator_node), - get_outgoing_tensors(local_training_backing.computation_graph, - operator_node), - get_incoming_weights(local_training_backing.computation_graph, - operator_node), - local_training_backing.local_tensor_backing.tensor_gradient_mapping, - device_state); - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - return call_task_impl( - local_training_backing.task_registry, invocation.task_id, accessor); - } else { - return std::nullopt; - } -} - -void execute_update(LocalTrainingBacking const &local_training_backing, - layer_guid_t const &node, - OptimizerAttrs const &optimizer_attrs, - Allocator &allocator) { - LayerAttrs layer_attrs = - get_layer_attrs(local_training_backing.computation_graph, node); - if (layer_attrs.op_attrs.has()) { - // get tensors - tensor_guid_t weight_tensor = get_only( - get_outgoing_tensors(local_training_backing.computation_graph, node)); - gradient_tensor_t weight_grad_tensor = - local_training_backing.local_tensor_backing.tensor_gradient_mapping.at( - weight_tensor); - std::vector optimizer_buffer_tensors = - local_training_backing.local_tensor_backing.tensor_optimizer_mapping.at( - weight_tensor); - - // get invocation - TaskInvocation invocation = get_update_invocation(optimizer_attrs, - weight_tensor, - weight_grad_tensor, - optimizer_buffer_tensors); - - // TODO: https://github.com/flexflow/flexflow-train/issues/1442 - // assert(is_invocation_valid(get_update_signature(attrs), invocation)); - - // execute update - TaskArgumentAccessor accessor = - get_task_arg_accessor(local_training_backing.local_tensor_backing, - local_training_backing.local_args_backing, - invocation, - allocator); - TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs); - update_impl_fn.get().function_ptr(accessor); - } -} - -TaskArgumentAccessor - get_task_arg_accessor(LocalTensorBacking const &local_tensor_backing, - LocalArgsBacking const &local_args_backing, - TaskInvocation const &invocation, - Allocator &allocator) { - TensorSlotsBacking tensor_slots_backing = - construct_tensor_slots_backing(local_tensor_backing, invocation.binding); - ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing( - invocation.binding, local_args_backing.runtime_arg_config); - return TaskArgumentAccessor::create( - allocator, tensor_slots_backing, arg_slots_backing); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/loss_tensor_source.cc b/lib/local-execution/src/loss_tensor_source.cc deleted file mode 100644 index f5ce639087..0000000000 --- a/lib/local-execution/src/loss_tensor_source.cc +++ /dev/null @@ -1,13 +0,0 @@ -#include "local-execution/loss_tensor_source.h" - -namespace FlexFlow { - -nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n; - -LossTensorSource::LossTensorSource() {} - -loss_tensor_t LossTensorSource::new_loss_tensor() { - return loss_tensor_t{LossTensorSource::next_available_loss_tensor_id++}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/model_training_instance.cc b/lib/local-execution/src/model_training_instance.cc deleted file mode 100644 index 
d3c1c65a68..0000000000 --- a/lib/local-execution/src/model_training_instance.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "local-execution/model_training_instance.h" -#include "kernels/format_accessor_contents.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "utils/containers/reversed.h" - -namespace FlexFlow { - -ModelTrainingInstance::ModelTrainingInstance( - Allocator const &allocator, - LocalTrainingBacking const &local_training_backing, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, - LossAttrs const &loss_attrs, - OptimizerAttrs const &optimizer_attrs) - : allocator(allocator), training_backing(local_training_backing), - loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs), - logit_tensor(logit_tensor), label_tensor(label_tensor){}; - -PerLayerElapsedTime ModelTrainingInstance::forward() { - PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : - topological_ordering(this->training_backing.computation_graph)) { - std::optional elapsed_time = - execute_forward(this->training_backing, node, this->allocator); - per_layer_elapsed_time.insert({node, elapsed_time}); - } - return per_layer_elapsed_time; -} - -PerLayerElapsedTime ModelTrainingInstance::backward() { - compute_loss(this->training_backing, - this->loss_attrs, - this->logit_tensor, - this->label_tensor, - this->allocator); - - gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.local_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - - PerLayerElapsedTime per_layer_elapsed_time; - for (layer_guid_t const &node : reversed( - topological_ordering(this->training_backing.computation_graph))) { - std::optional elapsed_time = - execute_backward(this->training_backing, node, this->allocator); - per_layer_elapsed_time.insert({node, elapsed_time}); - } - return per_layer_elapsed_time; -} - -void ModelTrainingInstance::update() { - for (layer_guid_t const &node : - topological_ordering(this->training_backing.computation_graph)) { - execute_update( - this->training_backing, node, this->optimizer_attrs, this->allocator); - } - this->optimizer_attrs = - get_optimizer_attrs_for_next_iter(this->optimizer_attrs); -} - -GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { - GenericTensorAccessorW logit_tensor_backing = this->training_backing - .local_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - - - gradient_tensor_t loss_tensor = - this->training_backing.local_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.local_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - - return read_only_accessor_from_write_accessor(loss_tensor_backing); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/task_binding.cc b/lib/local-execution/src/task_binding.cc index 4537493c1d..3894fb8d34 100644 --- a/lib/local-execution/src/task_binding.cc +++ b/lib/local-execution/src/task_binding.cc @@ -1,46 +1,60 @@ #include "task-spec/task_binding.h" #include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" #include "utils/containers/contains_key.h" #include "utils/fmt/unordered_map.h" +#include "utils/hash/tuple.h" #include "utils/hash/unordered_map.h" namespace FlexFlow { -void TaskBinding::bind(int name, 
tensor_guid_t const &binding) {
+TaskBinding::TaskBinding() : tensor_bindings(), arg_bindings() {}
+
+TaskBinding::TaskBinding(
+    std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const
+        &tensor_bindings,
+    std::unordered_map<slot_id_t, TaskArgSpec> const &arg_bindings)
+    : tensor_bindings(tensor_bindings), arg_bindings(arg_bindings) {}
+
+void TaskBinding::bind(int name, forward_tensor_guid_t const &binding) {
   this->bind(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind(slot_id_t name, tensor_guid_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::FORWARD},
-                                TensorTypeVariant{binding}});
+void TaskBinding::bind(slot_id_t name, forward_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert({tensor_sub_slot_id_t{name, TensorType::FORWARD},
+                                training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_grad(int name, gradient_tensor_t const &binding) {
+void TaskBinding::bind_grad(int name, gradient_tensor_guid_t const &binding) {
   this->bind_grad(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind_grad(slot_id_t name, gradient_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::GRADIENT},
-                                TensorTypeVariant{binding}});
+void TaskBinding::bind_grad(slot_id_t name,
+                            gradient_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {tensor_sub_slot_id_t{name, TensorType::GRADIENT},
+       training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_optimizer(int name, optimizer_tensor_t const &binding) {
+void TaskBinding::bind_optimizer(int name,
+                                 optimizer_tensor_guid_t const &binding) {
   this->bind_optimizer(slot_id_t{name}, binding);
 }
 
 void TaskBinding::bind_optimizer(slot_id_t name,
-                                 optimizer_tensor_t const &binding) {
-  this->tensor_bindings.insert({SlotTensorTypeId{name, TensorType::OPTIMIZER},
-                                TensorTypeVariant{binding}});
+                                 optimizer_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert(
+      {tensor_sub_slot_id_t{name, TensorType::OPTIMIZER},
+       training_tensor_guid_t{binding}});
 }
 
-void TaskBinding::bind_loss(int name, loss_tensor_t const &binding) {
+void TaskBinding::bind_loss(int name, loss_tensor_guid_t const &binding) {
   this->bind_loss(slot_id_t{name}, binding);
 }
 
-void TaskBinding::bind_loss(slot_id_t name, loss_tensor_t const &binding) {
-  this->tensor_bindings.insert(
-      {SlotTensorTypeId{name, TensorType::LOSS}, TensorTypeVariant{binding}});
+void TaskBinding::bind_loss(slot_id_t name, loss_tensor_guid_t const &binding) {
+  this->tensor_bindings.insert({tensor_sub_slot_id_t{name, TensorType::LOSS},
+                                training_tensor_guid_t{binding}});
 }
 
 void TaskBinding::insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec) {
@@ -56,13 +70,14 @@ bool TaskBinding::operator!=(TaskBinding const &other) const {
   return this->tie() != other.tie();
 }
 
-std::tuple<std::unordered_map<SlotTensorTypeId, TensorTypeVariant> const &,
-           std::unordered_map<slot_id_t, TaskArgSpec> const &>
+std::tuple<
+    std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const &,
+    std::unordered_map<slot_id_t, TaskArgSpec> const &>
     TaskBinding::tie() const {
   return std::tie(this->tensor_bindings, this->arg_bindings);
 }
 
-std::unordered_map<SlotTensorTypeId, TensorTypeVariant> const &
+std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t> const &
   TaskBinding::get_tensor_bindings() const {
   return this->tensor_bindings;
 }
@@ -90,10 +105,7 @@ namespace std {
 size_t hash<::FlexFlow::TaskBinding>::operator()(
     ::FlexFlow::TaskBinding const &s) const {
-  size_t result = 0;
-  hash_combine(result, s.get_tensor_bindings());
-  hash_combine(result, s.get_arg_bindings());
-  return result;
+  return ::FlexFlow::get_std_hash(s.tie());
 }
 
 } // namespace std
diff --git a/lib/local-execution/src/task_registry.cc b/lib/local-execution/src/task_registry.cc
deleted file mode 100644
index
ae3d97daa4..0000000000 --- a/lib/local-execution/src/task_registry.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include "local-execution/task_registry.h" -#include "pcg/computation_graph.h" -#include "task-spec/task_signature_impl.h" - -namespace FlexFlow { - -TaskRegistry construct_task_registry( - std::unordered_map const &layer_attrs_mapping) { - std::unordered_map> init_task_ids; - std::unordered_map> fwd_task_ids; - std::unordered_map> bwd_task_ids; - - std::unordered_map task_mapping; - - for (std::pair const &layer_attrs : - layer_attrs_mapping) { - layer_guid_t node = layer_attrs.first; - init_task_ids.insert({node, std::nullopt}); - fwd_task_ids.insert({node, std::nullopt}); - bwd_task_ids.insert({node, std::nullopt}); - - ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs; - std::vector task_ids = get_task_ids(attrs); - - for (task_id_t const &task_id : task_ids) { - TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id); - switch (task_signature_impl.task_signature.type) { - case OpTaskType::INIT: - assert(is_invocation_valid(task_signature_impl.task_signature, - init(attrs))); - init_task_ids[node] = task_id; - break; - case OpTaskType::FWD: - assert(is_invocation_valid(task_signature_impl.task_signature, - forward(attrs))); - fwd_task_ids[node] = task_id; - break; - case OpTaskType::BWD: - assert(is_invocation_valid(task_signature_impl.task_signature, - backward(attrs))); - bwd_task_ids[node] = task_id; - break; - default: - throw mk_runtime_error( - fmt::format("Invalid OpTaskType, got {}", - task_signature_impl.task_signature.type)); - } - task_mapping.insert({task_id, task_signature_impl}); - } - } - - return TaskRegistry{init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping}; -} - -bool registry_contains_task_for_layer(TaskRegistry const &task_registry, - layer_guid_t const &op, - OpTaskType const &op_task_type) { - std::unordered_map> task_ids; - switch (op_task_type) { - case OpTaskType::INIT: - task_ids = task_registry.init_task_ids; - break; - case OpTaskType::FWD: - task_ids = task_registry.forward_task_ids; - break; - case OpTaskType::BWD: - task_ids = task_registry.backward_task_ids; - break; - default: - throw mk_runtime_error( - fmt::format("Invalid OpTaskType, got {}", op_task_type)); - } - - assert(task_ids.count(op)); - return task_ids.at(op).has_value(); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/tracked_allocator.cc b/lib/local-execution/src/tracked_allocator.cc index ed181aea32..3ac7352e59 100644 --- a/lib/local-execution/src/tracked_allocator.cc +++ b/lib/local-execution/src/tracked_allocator.cc @@ -19,8 +19,8 @@ void TrackedAllocator::deallocate(void *ptr) { this->current_mem_usage -= psize; } -size_t TrackedAllocator::get_current_mem_usage() { - return this->current_mem_usage; +num_bytes_t TrackedAllocator::get_current_mem_usage() const { + return num_bytes_t{nonnegative_int{this->current_mem_usage}}; } DeviceType TrackedAllocator::get_allocation_device_type() const { diff --git a/lib/local-execution/src/unallocated_tensors.cc b/lib/local-execution/src/unallocated_tensors.cc deleted file mode 100644 index b8daa90e3b..0000000000 --- a/lib/local-execution/src/unallocated_tensors.cc +++ /dev/null @@ -1,92 +0,0 @@ -#include "local-execution/unallocated_tensors.h" -#include "local-execution/allocated_tensors.h" -#include "pcg/optimizer_attrs.h" - -namespace FlexFlow { - -UnallocatedTensors generate_unallocated_tensors( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - 
GradientTensorSource &gradient_tensor_source) { - - assert(are_allocated_tensors_valid(allocated_tensors, tensor_attrs_mapping)); - - std::unordered_map tensor_type_shapes; - std::unordered_map gradient_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - TensorTypeVariant tensor_guid_type = TensorTypeVariant{tensor_guid}; - if (!allocated_tensors.tensor_type_backings.count(tensor_guid_type)) { - tensor_type_shapes.insert({tensor_guid_type, tensor_attrs.shape}); - } - - if (tensor_attrs.create_grad == CreateGrad::YES && - !allocated_tensors.gradient_mapping.count(tensor_guid)) { - gradient_tensor_t gradient_tensor = - gradient_tensor_source.new_gradient_tensor(); - tensor_type_shapes.insert( - {TensorTypeVariant{gradient_tensor}, tensor_attrs.shape}); - gradient_mapping.insert({tensor_guid, gradient_tensor}); - } - } - - return UnallocatedTensors{tensor_type_shapes, gradient_mapping, {}}; -} - -UnallocatedTensors generate_unallocated_tensors_with_optimizer( - AllocatedTensors const &allocated_tensors, - std::unordered_map const &tensor_attrs_mapping, - GradientTensorSource &gradient_tensor_source, - OptimizerTensorSource &optimizer_tensor_source, - OptimizerAttrs const &optimizer_attrs) { - - UnallocatedTensors unallocated_tensors = generate_unallocated_tensors( - allocated_tensors, tensor_attrs_mapping, gradient_tensor_source); - - if (!get_num_optimizer_tensors(optimizer_attrs)) { - return unallocated_tensors; - } - - std::unordered_map tensor_type_shapes = - unallocated_tensors.tensor_type_shapes; - std::unordered_map gradient_mapping = - unallocated_tensors.gradient_mapping; - std::unordered_map> - optimizer_mapping; - - for (std::pair const &tensor_guid_attrs : - tensor_attrs_mapping) { - tensor_guid_t tensor_guid = tensor_guid_attrs.first; - TensorAttrs tensor_attrs = tensor_guid_attrs.second; - if (tensor_attrs.create_grad == CreateGrad::YES) { - std::vector optimizer_tensors; - - int num_optimizer_tensors_to_allocate = - get_num_optimizer_tensors(optimizer_attrs); - if (allocated_tensors.optimizer_mapping.count(tensor_guid)) { - num_optimizer_tensors_to_allocate -= - allocated_tensors.optimizer_mapping.at(tensor_guid).size(); - } - - for (int i = 0; i < num_optimizer_tensors_to_allocate; ++i) { - optimizer_tensor_t optimizer_tensor = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensors.push_back(optimizer_tensor); - tensor_type_shapes.insert( - {TensorTypeVariant{optimizer_tensor}, tensor_attrs.shape}); - } - - if (num_optimizer_tensors_to_allocate > 0) { - optimizer_mapping.insert({tensor_guid, optimizer_tensors}); - } - } - } - - return UnallocatedTensors{ - tensor_type_shapes, gradient_mapping, optimizer_mapping}; -} - -} // namespace FlexFlow diff --git a/lib/local-execution/test/src/test_utils.cc b/lib/local-execution/test/src/internal/test_utils.cc similarity index 94% rename from lib/local-execution/test/src/test_utils.cc rename to lib/local-execution/test/src/internal/test_utils.cc index b7a4e16b97..629640b6ae 100644 --- a/lib/local-execution/test/src/test_utils.cc +++ b/lib/local-execution/test/src/internal/test_utils.cc @@ -1,4 +1,4 @@ -#include "test_utils.h" +#include "internal/test_utils.h" #include "pcg/tensor_guid_t.dtg.h" namespace FlexFlow { diff --git a/lib/local-execution/test/src/test_utils.h b/lib/local-execution/test/src/internal/test_utils.h similarity index 100% rename from 
lib/local-execution/test/src/test_utils.h rename to lib/local-execution/test/src/internal/test_utils.h diff --git a/lib/local-execution/test/src/local-execution/local_cost_estimator.cc b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc new file mode 100644 index 0000000000..107b835383 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_cost_estimator.cc @@ -0,0 +1,142 @@ +#include "local-execution/local_cost_estimator.h" +#include "doctest/doctest.h" +#include "internal/test_utils.h" +#include "kernels/device_handle_t.h" +#include "kernels/managed_per_device_ff_handle.h" +#include "op-attrs/ops/attention.h" +#include "op-attrs/ops/cast.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/machine_view.h" +#include "task-spec/runtime_arg_config.h" + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalCostEstimator") { + RuntimeArgConfig runtime_arg_config = + cpu_make_runtime_arg_config(EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + + CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config); + + SUBCASE("estimate operator cost") { + CastAttrs attrs = CastAttrs{ + /*dtype=*/DataType::INT32, + }; + + ParallelTensorShape input_shape = lift_to_parallel(TensorShape{ + TensorDims{FFOrdered{40_p, 48_p, 36_p}}, + DataType::FLOAT, + }); + + ParallelTensorShape output_shape = + throw_if_unexpected(get_output_shape(attrs, input_shape)); + + OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{attrs}, + /*input_shapes=*/{input_shape}, + /*weight_shapes=*/{}, + /*output_shapes=*/{output_shape}, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/ + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::CPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{1_p}), + }; + + OpCostMetrics result = cost_estimator.estimate_cost(op_cost_estimate_key); + + CHECK(result.forward_runtime > 0_ms); + CHECK(result.backward_runtime > 0_ms); + CHECK(result.memory_usage > 0_bytes); + } + } +} + +TEST_SUITE(FF_CUDA_TEST_SUITE) { + TEST_CASE("LocalCostEstimator (CUDA)") { + ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( + /*workSpaceSize=*/1024 * 1024, + /*allowTensorOpMathConversion=*/true); + + RuntimeArgConfig runtime_arg_config = + gpu_make_runtime_arg_config(managed_handle.raw_handle(), + EnableProfiling::YES, + ProfilingSettings{/*warmup_iters=*/0, + /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.1, + /*momentum=*/0.1, + /*nesterov=*/false, + /*weight_decay=*/0.1, + }, + }; + + CostEstimator cost_estimator = get_local_cost_estimator(runtime_arg_config); + + SUBCASE("estimate operator cost") { + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; + MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/false, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }; + + positive_int batch_size = 40_p; + positive_int seq_len = 48_p; + positive_int feature_size = 36_p; + + DataType dtype = DataType::FLOAT; + ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{ + TensorDims{ + 
FFOrdered{batch_size, seq_len, feature_size}}, + DataType::FLOAT, + }); + + ParallelTensorShape weights_shape = throw_if_unexpected( + get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + + ParallelTensorShape output_shape = throw_if_unexpected( + get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); + + OpCostEstimateKey op_cost_estimate_key = OpCostEstimateKey{ + /*op_attrs=*/PCGOperatorAttrs{attrs}, + /*input_shapes=*/{inputs_shape, inputs_shape, inputs_shape}, + /*weight_shapes=*/{weights_shape}, + /*output_shapes=*/{output_shape}, + /*optimizer_attrs=*/optimizer_attrs, + /*machine_view=*/ + make_1d_machine_view( + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, + MachineSpecificationDimension::INTRA_NODE, + stride_t{1_p}), + }; + + OpCostMetrics result = cost_estimator.estimate_cost(op_cost_estimate_key); + + CHECK(result.forward_runtime > 0_ms); + CHECK(result.backward_runtime > 0_ms); + CHECK(result.memory_usage > 0_bytes); + } + } +} diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc similarity index 86% rename from lib/local-execution/test/src/test_local_task_arg_accessor.cc rename to lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc index 5c11010e2a..482795b278 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/local-execution/local_task_argument_accessor.cc @@ -1,6 +1,6 @@ +#include "local-execution/local_task_argument_accessor.h" #include "doctest/doctest.h" #include "kernels/local_cpu_allocator.h" -#include "local-execution/local_task_argument_accessor.h" #include "task-spec/task_signature_impl.h" #include "utils/fmt/variant.h" @@ -36,16 +36,33 @@ TEST_SUITE(FF_TEST_SUITE) { VARIADIC_TENSORS, }; - TensorSlotsBacking tensor_slots_backing = { - {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::FORWARD}, input}, - {SlotTensorTypeId{slot_id_t{INPUT}, TensorType::GRADIENT}, input_grad}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::FORWARD}, - variadic_tensors}, - {SlotTensorTypeId{slot_id_t{VARIADIC_TENSORS}, TensorType::GRADIENT}, - variadic_tensors_grad}, - }; + std::unordered_map + tensor_slots_backing = { + { + tensor_sub_slot_id_t{slot_id_t{INPUT}, TensorType::FORWARD}, + TensorSlotBacking{input}, + }, + { + tensor_sub_slot_id_t{slot_id_t{INPUT}, TensorType::GRADIENT}, + TensorSlotBacking{input_grad}, + }, + { + tensor_sub_slot_id_t{slot_id_t{VARIADIC_TENSORS}, + TensorType::FORWARD}, + TensorSlotBacking{variadic_tensors}, + }, + { + tensor_sub_slot_id_t{slot_id_t{VARIADIC_TENSORS}, + TensorType::GRADIENT}, + TensorSlotBacking{variadic_tensors_grad}, + }, + }; - LocalTaskArgumentAccessor acc = {allocator, tensor_slots_backing, {}}; + LocalTaskArgumentAccessor acc = LocalTaskArgumentAccessor{ + /*allocator=*/allocator, + /*tensor_slots_backing=*/tensor_slots_backing, + /*arg_slots_backing=*/{}, + }; SUBCASE("get_tensor") { SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::FORWARD)") { @@ -55,6 +72,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{INPUT}, Permissions::RO, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{ read_only_accessor_from_write_accessor(input_grad)}; @@ -62,24 +80,28 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{INPUT}, Permissions::RO, TensorType::GRADIENT); CHECK(correct == result); } + 
SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::WO, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::WO, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::WO, TensorType::GRADIENT); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::FORWARD)") { GenericTensorAccessor correct = GenericTensorAccessor{input}; GenericTensorAccessor result = acc.get_tensor( slot_id_t{INPUT}, Permissions::RW, TensorType::FORWARD); CHECK(correct == result); } + SUBCASE("get_tensor(slot_id_t, Permissions::RW, TensorType::GRADIENT)") { GenericTensorAccessor correct = GenericTensorAccessor{input_grad}; GenericTensorAccessor result = acc.get_tensor( @@ -100,6 +122,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::RO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = @@ -112,6 +135,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RO, TensorType::GRADIENT); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = @@ -120,6 +144,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = @@ -128,6 +153,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::WO, TensorType::GRADIENT); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::FORWARD)") { VariadicGenericTensorAccessor correct = @@ -136,6 +162,7 @@ TEST_SUITE(FF_TEST_SUITE) { slot_id_t{VARIADIC_TENSORS}, Permissions::RW, TensorType::FORWARD); CHECK(result == correct); } + SUBCASE("get_variadic_tensor(slot_id_t, Permissions::WO, " "TensorType::GRADIENT)") { VariadicGenericTensorAccessor correct = diff --git a/lib/local-execution/test/src/local-execution/local_task_registry.cc b/lib/local-execution/test/src/local-execution/local_task_registry.cc new file mode 100644 index 0000000000..27cd74b2a6 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_task_registry.cc @@ -0,0 +1,278 @@ +#include "local-execution/local_task_registry.h" +#include "kernels/local_cuda_allocator.h" +#include "local-execution/local_cost_estimator.h" +#include "local-execution/local_task_registry.dtg.h" +#include "local-execution/operator_task_set.h" +#include "local-execution/registered_task.h" +#include "pcg/computation_graph_builder.h" +#include "pcg/layer_guid_t.dtg.h" +#include "task-spec/task_signature_impl.h" +#include "utils/fmt/optional.h" +#include "utils/fmt/unordered_map.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("LocalTaskRegistry") { + layer_guid_t layer_guid = layer_guid_t{Node{0}}; + positive_int embed_dim = 32_p; + positive_int num_heads = 10_p; + ComputationGraphOpAttrs attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + 
/*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + + OperatorTaskSet mha_task_set = get_task_set_for_operator(attrs); + { + OperatorTaskSet expected_mha_task_set = OperatorTaskSet{ + /*init_task=*/registered_task_t{task_id_t::ATTENTION_INIT_TASK_ID}, + /*fwd_task=*/registered_task_t{task_id_t::ATTENTION_FWD_TASK_ID}, + /*bwd_task=*/registered_task_t{task_id_t::ATTENTION_BWD_TASK_ID}, + }; + REQUIRE(mha_task_set == expected_mha_task_set); + } + + std::unordered_map mha_task_mapping = { + {task_id_t::ATTENTION_INIT_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_INIT_TASK_ID)}, + {task_id_t::ATTENTION_FWD_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_FWD_TASK_ID)}, + {task_id_t::ATTENTION_BWD_TASK_ID, + get_task_signature_and_impl_for_task_id( + task_id_t::ATTENTION_BWD_TASK_ID)}, + }; + + SUBCASE("register single layer") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + + LocalTaskRegistry correct_task_registry = [&] { + std::unordered_map task_sets = { + { + layer_guid, + mha_task_set, + }, + }; + + return LocalTaskRegistry{ + /*task_sets=*/{ + {layer_guid, mha_task_set}, + }, + /*task_mapping=*/mha_task_mapping, + }; + }(); + + CHECK(task_registry == correct_task_registry); + } + + SUBCASE("multiple layers same task") { + layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + {other_layer_guid, LayerAttrs{attrs, std::nullopt}}, + }); + + SUBCASE("layer to task ids") { + std::unordered_map correct = { + {layer_guid, mha_task_set}, + {other_layer_guid, mha_task_set}, + }; + CHECK(task_registry.task_sets == correct); + } + + SUBCASE("task to signature+impl mapping") { + std::unordered_map correct = + mha_task_mapping; + + CHECK(task_registry.task_mapping == correct); + } + } + + SUBCASE("different attrs, still same task fn mapping") { + layer_guid_t layer_1 = layer_guid_t{Node{1}}; + positive_int embed_dim = 100_p; + layer_guid_t layer_2 = layer_guid_t{Node{2}}; + ComputationGraphOpAttrs other_attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + {layer_1, LayerAttrs{attrs, std::nullopt}}, + {layer_2, LayerAttrs{other_attrs, std::nullopt}}, + }); + + std::unordered_map correct_task_mapping = + mha_task_mapping; + + CHECK(task_registry.task_mapping == correct_task_mapping); + } + + SUBCASE("equality") { + SUBCASE("different attrs is still equal") { + positive_int embed_dim = 100_p; + ComputationGraphOpAttrs other_attrs = + ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/embed_dim, + /*vdim=*/embed_dim, + /*dropout=*/0.0, + /*bias=*/true, + /*add_bias_kv=*/false, + /*add_zero_attn=*/false, + }}; + + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + LocalTaskRegistry other_task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}}); + + 
CHECK(task_registry == other_task_registry); + } + + SUBCASE("different layer_guid is not equal") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers( + {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); + layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; + LocalTaskRegistry other_task_registry = + construct_local_task_registry_for_layers( + {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}}); + + CHECK(task_registry != other_task_registry); + } + } + + SUBCASE("try_get_registered_task") { + SUBCASE("Task exists") { + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{attrs, std::nullopt}}, + }); + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_INIT_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_FWD_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = registered_task_t{ + task_id_t::ATTENTION_BWD_TASK_ID, + }; + + CHECK(result == correct); + } + } + + SUBCASE("Partial task does not exist") { + ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{ + BatchMatmulAttrs{/*a_seq_length_dim=*/10_n, + /*b_seq_length_dim=*/20_n}}; + LocalTaskRegistry task_registry = + construct_local_task_registry_for_layers({ + {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}}, + }); + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = + make_noop_registered_task(); + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = registered_task_t{ + task_id_t::BATCHMATMUL_FWD_TASK_ID, + }; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = registered_task_t{ + task_id_t::BATCHMATMUL_BWD_TASK_ID, + }; + + CHECK(result == correct); + } + } + + SUBCASE("Empty tasks") { + LocalTaskRegistry task_registry = LocalTaskRegistry{ + /*task_sets=*/{}, + /*task_mapping=*/{}, + }; + + SUBCASE("Init") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::INIT); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + + SUBCASE("Fwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::FWD); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + + SUBCASE("Bwd") { + std::optional result = try_get_registered_task( + task_registry, layer_guid, OpTaskType::BWD); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + } + } + } +} diff --git a/lib/local-execution/test/src/local-execution/local_tensor_backing.cc b/lib/local-execution/test/src/local-execution/local_tensor_backing.cc new file mode 100644 index 0000000000..2f5bf493d6 --- /dev/null +++ b/lib/local-execution/test/src/local-execution/local_tensor_backing.cc @@ -0,0 +1,285 @@ +#include "local-execution/local_tensor_backing.h" +#include "internal/test_utils.h" 
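//
// Illustrative note (not part of the patch): the contract exercised by this
// test file is that construct_local_tensor_backing allocates a buffer for
// every entry of training_tensor_shapes except those already present in
// preallocated_tensors, which are adopted as-is, and that a preallocated
// tensor missing from training_tensor_shapes is an error. A minimal sketch of
// that behavior, assuming the helper names used elsewhere in this patch:
//
//   std::unordered_map<training_tensor_guid_t, GenericTensorAccessorW> backing;
//   for (auto const &[t, shape] : training_tensor_shapes) {
//     if (contains_key(preallocated_tensors, t)) {
//       backing.insert({t, preallocated_tensors.at(t)}); // adopt, don't copy
//     } else {
//       backing.insert({t, allocator.allocate_tensor(shape)}); // fresh buffer
//     }
//   }
//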
+#include "kernels/local_cpu_allocator.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "test/utils/doctest/check_kv.h" +#include "test/utils/doctest/fmt/unordered_map.h" +#include "utils/containers/keys.h" +#include + +using namespace ::FlexFlow; + +bool is_shape_and_dtype_equal_for_tensor_backings( + LocalTensorBacking const &b1, LocalTensorBacking const &b2) { + + std::unordered_map m1 = + b1.backing_for_training_tensor_map; + std::unordered_map m2 = + b2.backing_for_training_tensor_map; + + if (keys(m1) == keys(m2)) { + for (std::pair const + &tensor_type_backing : m1) { + if (tensor_type_backing.second.shape == + m2.at(tensor_type_backing.first).shape) { + continue; + } else { + return false; + } + } + return true; + } else { + return false; + } +} + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("construct_local_tensor_backing") { + Allocator allocator = create_local_cpu_memory_allocator(); + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{gradient_tensor_guid_t{4}}; + training_tensor_guid_t t3 = + training_tensor_guid_t{gradient_tensor_guid_t{5}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{gradient_tensor_guid_t{6}}; + + TensorShape tensor_shape_1 = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + TensorShape tensor_shape_2 = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + std::unordered_map + training_tensor_shapes = { + {t1, tensor_shape_1}, + {t2, tensor_shape_2}, + {t3, tensor_shape_1}, + }; + + GenericTensorAccessorW t3_accessor = + allocator.allocate_tensor(tensor_shape_2); + SUBCASE("allocates all non-preallocated tensors and does not re-allocate " + "the preallocated ones") { + std::unordered_map + preallocated_tensors = { + {t3, t3_accessor}, + }; + + LocalTensorBacking result = construct_local_tensor_backing( + /*training_tensor_shapes=*/training_tensor_shapes, + /*preallocated_tensors=*/preallocated_tensors, + /*allocator=*/allocator); + LocalTensorBacking correct = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{ + {t3, t3_accessor}, + {t1, allocator.allocate_tensor(tensor_shape_1)}, + {t2, allocator.allocate_tensor(tensor_shape_2)}, + }, + }; + + CHECK_MESSAGE( + is_shape_and_dtype_equal_for_tensor_backings(result, correct), + check_kv("result", fmt::to_string(result)), + check_kv("correct", fmt::to_string(correct))); + + CHECK(get_accessor_for_training_tensor(result, t3) == t3_accessor); + } + + SUBCASE("fails if a preallocated tensor is not in training_tensor_shapes") { + std::unordered_map + preallocated_tensors = { + {t4, t3_accessor}, + }; + + CHECK_THROWS(construct_local_tensor_backing( + /*training_tensor_shapes=*/training_tensor_shapes, + /*preallocated_tensors=*/preallocated_tensors, + /*allocator=*/allocator)); + } + } + + TEST_CASE("get_accessor_for_training_tensor") { + Allocator allocator = create_local_cpu_memory_allocator(); + + TensorShape tensor_shape = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{gradient_tensor_guid_t{4}}; + + GenericTensorAccessorW t1_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t2_accessor = + allocator.allocate_tensor(tensor_shape); + + 
LocalTensorBacking local_tensor_backing = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{{ + t1, + t1_accessor, + }, + { + t2, + t2_accessor, + }}, + }; + + SUBCASE("returns corresponding accessor if training tensor is present") { + GenericTensorAccessorW result = + get_accessor_for_training_tensor(local_tensor_backing, t1); + GenericTensorAccessorW correct = t1_accessor; + + CHECK(result == correct); + } + + SUBCASE("fails if the training tensor is not present") { + training_tensor_guid_t t3 = + training_tensor_guid_t{optimizer_tensor_guid_t{4}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{forward_tensor_guid_t{3}}; + + CHECK_THROWS(get_accessor_for_training_tensor(local_tensor_backing, t3)); + CHECK_THROWS(get_accessor_for_training_tensor(local_tensor_backing, t4)); + } + } + + TEST_CASE("construct_tensor_slots_backing_for_binding") { + enum Slots { + TENSOR_SLOT_1, + TENSOR_SLOT_2, + TENSOR_SLOT_3, + ARG_SLOT, + }; + + Allocator allocator = create_local_cpu_memory_allocator(); + + TensorShape tensor_shape = TensorShape{ + TensorDims{FFOrdered{ + 4_p, + 5_p, + }}, + DataType::FLOAT, + }; + + training_tensor_guid_t t1 = + training_tensor_guid_t{forward_tensor_guid_t{4}}; + training_tensor_guid_t t2 = + training_tensor_guid_t{forward_tensor_guid_t{5}}; + training_tensor_guid_t t3 = + training_tensor_guid_t{forward_tensor_guid_t{6}}; + training_tensor_guid_t t4 = + training_tensor_guid_t{gradient_tensor_guid_t{5}}; + + GenericTensorAccessorW t1_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t2_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t3_accessor = + allocator.allocate_tensor(tensor_shape); + GenericTensorAccessorW t4_accessor = + allocator.allocate_tensor(tensor_shape); + + tensor_sub_slot_id_t tensor_slot_1_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_1}, + TensorType::FORWARD, + }; + tensor_sub_slot_id_t tensor_slot_1_gradient = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_1}, + TensorType::GRADIENT, + }; + tensor_sub_slot_id_t tensor_slot_2_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_2}, + TensorType::FORWARD, + }; + tensor_sub_slot_id_t tensor_slot_3_forward = tensor_sub_slot_id_t{ + slot_id_t{TENSOR_SLOT_3}, + TensorType::FORWARD, + }; + + LocalTensorBacking local_tensor_backing = LocalTensorBacking{ + /*backing_for_training_tensor_map=*/{{ + t1, + t1_accessor, + }, + { + t2, + t2_accessor, + }, + { + t3, + t3_accessor, + }, + { + t4, + t4_accessor, + }}, + }; + + TaskBinding task_binding = TaskBinding{ + /*tensor_bindings=*/{ + { + tensor_slot_1_forward, + t1, + }, + { + tensor_slot_2_forward, + t2, + }, + { + tensor_slot_1_gradient, + t4, + }, + }, + /*arg_bindings=*/ + { + { + slot_id_t{ARG_SLOT}, + TaskArgSpec{ + ConcreteArgSpec::create(4), + }, + }, + }, + }; + + std::unordered_map result = + construct_tensor_slots_backing_for_binding(local_tensor_backing, + task_binding); + std::unordered_map correct = { + { + tensor_slot_1_forward, + TensorSlotBacking{t1_accessor}, + }, + { + tensor_slot_2_forward, + TensorSlotBacking{t2_accessor}, + }, + { + tensor_slot_1_gradient, + TensorSlotBacking{t4_accessor}, + }, + }; + + CHECK(result == correct); + } +} diff --git a/lib/local-execution/test/src/test_update.cc b/lib/local-execution/test/src/local-execution/local_training_backing.cc similarity index 68% rename from lib/local-execution/test/src/test_update.cc rename to lib/local-execution/test/src/local-execution/local_training_backing.cc index 54c64e6b6c..5436dbdbb7 100644 
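//
// Illustrative note (not part of the patch): the renamed test below drives
// execute_update. Per the hunks above, its flow is: skip layers whose attrs
// are not weights; otherwise look up the weight's forward, gradient, and
// optimizer tensors, build the optimizer's TaskInvocation, and run the
// resulting task implementation. A hedged sketch of the per-weight step,
// reusing names introduced in this patch:
//
//   TaskInvocation invocation = get_update_invocation(
//       optimizer_attrs,
//       weight_tensor_group.forward_tensor,
//       weight_tensor_group.gradient_tensor,
//       weight_tensor_group.optimizer_tensors);
//   TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
//   update_impl_fn.get<GenericTaskImplFunction>().function_ptr(accessor);
//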
--- a/lib/local-execution/test/src/test_update.cc +++ b/lib/local-execution/test/src/local-execution/local_training_backing.cc @@ -1,18 +1,23 @@ -#include "doctest/doctest.h" +#include "local-execution/local_training_backing.h" +#include "internal/test_utils.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/local_training_backing.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" +#include "utils/containers/get_only.h" +#include using namespace ::FlexFlow; TEST_SUITE(FF_CUDA_TEST_SUITE) { - TEST_CASE("ExecuteUpdate") { + TEST_CASE("execute_update") { // initialize runtime configs ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle( @@ -20,7 +25,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*allowTensorOpMathConversion=*/true); Allocator allocator = create_local_cuda_memory_allocator(); - AllocatedTensors allocated_tensors = make_empty_allocated_tensors(); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); @@ -56,14 +60,35 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { "linear"}, inputs_layer.outputs, weights_layer.outputs); + tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + auto make_training_backing = [&](OptimizerAttrs const &optimizer_attrs) { + TrainingComputationGraph training_computation_graph = + generate_training_computation_graph(computation_graph, + optimizer_attrs, + logit_tensor, + forward_tensor_source, + gradient_tensor_source, + optimizer_tensor_source, + loss_tensor_source); + + return make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/{}, + /*training_computation_graph=*/training_computation_graph, + /*runtime_arg_config=*/runtime_arg_config, + /*optimizer_attrs=*/optimizer_attrs); + }; SUBCASE("SGDOptimizerAttrs") { SUBCASE("momentum=0") { @@ -72,39 +97,27 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*momentum=*/0.0f, /*nesterov=*/false, /*weight_decay=*/0.001}}; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); } + SUBCASE("momentum=0.9") { OptimizerAttrs optimizer_attrs = OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - LocalTrainingBacking 
local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); } } + SUBCASE("AdamOptimizerAttrs") { OptimizerAttrs optimizer_attrs = OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, @@ -115,15 +128,7 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { /*beta_t=*/0.9, /*beta2_t=*/0.999, /*epsilon=*/1e-8}}; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - optimizer_tensor_source, - computation_graph, - runtime_arg_config, - optimizer_attrs}; - execute_update(local_training_backing, + execute_update(make_training_backing(optimizer_attrs), linear_operator.layer, optimizer_attrs, allocator); diff --git a/lib/local-execution/test/src/test_loss_functions.cc b/lib/local-execution/test/src/local-execution/loss_functions.cc similarity index 54% rename from lib/local-execution/test/src/test_loss_functions.cc rename to lib/local-execution/test/src/local-execution/loss_functions.cc index d741d4d8d4..e5fffb980c 100644 --- a/lib/local-execution/test/src/test_loss_functions.cc +++ b/lib/local-execution/test/src/local-execution/loss_functions.cc @@ -1,14 +1,19 @@ #include "doctest/doctest.h" +#include "internal/test_utils.h" #include "kernels/local_cuda_allocator.h" #include "kernels/managed_ff_stream.h" #include "kernels/managed_per_device_ff_handle.h" -#include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" #include "utils/containers/get_only.h" using namespace ::FlexFlow; @@ -23,34 +28,10 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_for_nonconfigurable_loss_attrs = - loss_tensor_source.new_loss_tensor(); - loss_tensor_t label_for_sparse_cce_loss_attrs = - loss_tensor_source.new_loss_tensor(); - positive_int batch_size = 10_p; positive_int data_dim = 16_p; positive_int output_dim = 32_p; - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - TensorShape reduced_tensor_shape = - TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; - - GenericTensorAccessorW label_for_nonconfigurable_loss_attrs_backing = - allocator.allocate_tensor(output_tensor_shape); - GenericTensorAccessorW label_for_sparse_cce_loss_attrs_backing = - allocator.allocate_tensor(reduced_tensor_shape); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{label_for_nonconfigurable_loss_attrs}, - label_for_nonconfigurable_loss_attrs_backing}, - {TensorTypeVariant{label_for_sparse_cce_loss_attrs}, - label_for_sparse_cce_loss_attrs_backing}}, - {}, - {}}; - // construct computation graph ComputationGraph computation_graph = 
make_empty_computation_graph(); @@ -83,60 +64,92 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { weights_layer.outputs); tensor_guid_t logit_tensor = get_only(linear_operator.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); + + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.0, + /*momentum=*/0.0, + /*nesterov=*/false, + /*weight_decay=*/0.0, + }, + }; + + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; - LocalTrainingBacking local_training_backing = - LocalTrainingBacking{allocator, - allocated_tensors, - gradient_tensor_source, - computation_graph, - runtime_arg_config}; + OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; + + TrainingComputationGraph training_computation_graph = + generate_training_computation_graph(computation_graph, + optimizer_attrs, + logit_tensor, + forward_tensor_source, + gradient_tensor_source, + optimizer_tensor_source, + loss_tensor_source); + + auto make_training_backing = [&](TensorShape const &label_tensor_shape) { + GenericTensorAccessorW label_tensor_accessor = + allocator.allocate_tensor(label_tensor_shape); + + return make_local_training_backing_for_computation_graph( + /*allocator=*/allocator, + /*preallocated_tensors=*/ + { + { + training_tensor_guid_t{ + training_computation_graph.label_tensor}, + label_tensor_accessor, + }, + }, + /*training_computation_graph=*/training_computation_graph, + /*runtime_arg_config=*/runtime_arg_config, + /*optimizer_attrs=*/optimizer_attrs); + }; SUBCASE("SparseCategoricalCrossEntropyLossAttrs") { + TensorShape label_tensor_shape = + TensorShape{TensorDims{FFOrdered{batch_size, 1_p}}, DataType::FLOAT}; + + LocalTrainingBacking local_training_backing = + make_training_backing(label_tensor_shape); + LossAttrs loss_attrs = LossAttrs{ SparseCategoricalCrossEntropyLossAttrs{/*replace_labels=*/false}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_sparse_cce_loss_attrs, - allocator); + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("NonconfigurableLossAttrs") { + TensorShape label_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + LocalTrainingBacking local_training_backing = + make_training_backing(label_tensor_shape); + SUBCASE("LossFunction::CATEGORICAL_CROSSENTROPY") { LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{ LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE}}; - compute_loss(local_training_backing, - loss_attrs, - logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } SUBCASE("LossFunction::IDENTITY") { LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::IDENTITY}}; - compute_loss(local_training_backing, - loss_attrs, - 
logit_tensor, - label_for_nonconfigurable_loss_attrs, - allocator); + + compute_loss(local_training_backing, loss_attrs, allocator); } } } diff --git a/lib/local-execution/test/src/test_allocated_tensors.cc b/lib/local-execution/test/src/test_allocated_tensors.cc deleted file mode 100644 index 3242ca79ad..0000000000 --- a/lib/local-execution/test/src/test_allocated_tensors.cc +++ /dev/null @@ -1,226 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "pcg/computation_graph.dtg.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("AllocatedTensors") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - Allocator allocator = create_local_cpu_memory_allocator(); - - tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = - tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t dangling_tensor = tensor_guid_source.new_mock_tensor_guid(); - - TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW tensor_backing_1 = - allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = - allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = - allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); - - std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, - }; - - SUBCASE("Trivial tensors") { - SUBCASE("Empty") { - AllocatedTensors allocated_tensors = AllocatedTensors{{}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Loss tensor") { - loss_tensor_t loss_tensor = loss_tensor_source.new_loss_tensor(); - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{loss_tensor}, tensor_backing_1}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - } - - SUBCASE("Forward tensors") { - SUBCASE("Correct forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{mock_tensor_1}, tensor_backing_1}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Incorrect forward tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{mock_tensor_1}, tensor_backing_2}}, {}, {}}; - 
bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - { - {TensorTypeVariant{dangling_tensor}, tensor_backing_1}, - }, - {}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - - SUBCASE("Gradient tensors") { - gradient_tensor_t grad_tensor_3 = - gradient_tensor_source.new_gradient_tensor(); - - SUBCASE("Gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{mock_tensor_3_with_grad, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Dangling gradient tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, {}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling gradient tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {}, {{mock_tensor_3_with_grad, grad_tensor_3}}, {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient allocated for forward tensor without gradient") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{mock_tensor_2, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_2}}, - {{mock_tensor_3_with_grad, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Gradient tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{grad_tensor_3}, tensor_backing_3}}, - {{dangling_tensor, grad_tensor_3}}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - - SUBCASE("Optimizer tensors") { - optimizer_tensor_t optimizer_tensor_3 = - optimizer_tensor_source.new_optimizer_tensor(); - - SUBCASE("Optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == true); - } - - SUBCASE("Dangling optimizer tensor") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Dangling optimizer tensor in mapping") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {}, {}, {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer allocated for forward tensor without gradient") 
{ - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{mock_tensor_2, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer tensor with wrong shape") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_2}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - - SUBCASE("Optimizer tensor with dangling tensor guid") { - AllocatedTensors allocated_tensors = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_3}, tensor_backing_3}}, - {}, - {{dangling_tensor, {optimizer_tensor_3}}}}; - bool result = are_allocated_tensors_valid(allocated_tensors, - tensor_attrs_mapping); - CHECK(result == false); - } - } - } -} diff --git a/lib/local-execution/test/src/test_e2e.cc b/lib/local-execution/test/src/test_e2e.cc index f1c83e76a0..f8d34fc5ff 100644 --- a/lib/local-execution/test/src/test_e2e.cc +++ b/lib/local-execution/test/src/test_e2e.cc @@ -1,15 +1,25 @@ +#include "internal/test_utils.h" #include "kernels/compare_tensor_accessors.h" +#include "kernels/copy_tensor_accessor.h" #include "kernels/format_accessor_contents.h" +#include "kernels/local_cpu_allocator.h" +#include "kernels/local_cuda_allocator.h" +#include "kernels/managed_ff_stream.h" +#include "kernels/managed_per_device_ff_handle.h" #include "kernels/tensor_accessor_reductions.h" -#include "kernels/test_utils.h" -#include "local-execution/allocated_tensors.h" #include "local-execution/local_training_backing.h" #include "local-execution/model_training_instance.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/computation_graph.h" #include "pcg/computation_graph_builder.h" #include "pcg/optimizer_attrs.dtg.h" -#include "test_utils.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/runtime_arg_config.h" +#include "task-spec/training_computation_graph.h" +#include "test/utils/doctest/check_kv.h" #include "utils/containers/get_only.h" #include @@ -23,8 +33,139 @@ bool did_loss_decrease(GenericTensorAccessorR const &first_epoch, compare_tensor_accessors_le(last_epoch, first_epoch, cpu_allocator)); } -TEST_SUITE(FF_CUDA_TEST_SUITE) { +TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalBackend e2e Training") { + Allocator allocator = create_local_cpu_memory_allocator(); + + positive_int batch_size = 10_p; + positive_int data_dim = 16_p; + positive_int hidden_dim = 32_p; + positive_int output_dim = 1_p; + + TensorShape output_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; + + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); + + // construct computation graph + ComputationGraph computation_graph = make_empty_computation_graph(); + + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; + + TensorShape weight_shape_1 = TensorShape{ + TensorDims{FFOrdered{hidden_dim, data_dim}}, DataType::FLOAT}; + TensorShape weight_shape_2 = TensorShape{ + TensorDims{FFOrdered{output_dim, hidden_dim}}, DataType::FLOAT}; + + LayerAddedResult inputs_layer = + 
add_input_layer_with_grad(computation_graph, input_tensor_shape);
+
+    LayerAddedResult weights_layer_1 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                       weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                   std::nullopt},
+        {},
+        {});
+
+    LayerAddedResult weights_layer_2 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                       weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                   std::nullopt},
+        {},
+        {});
+
+    LayerAddedResult linear_operator_1 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim,
+                                                       /*use_bias=*/false,
+                                                       DataType::FLOAT,
+                                                       Activation::RELU,
+                                                       std::nullopt}},
+                   std::nullopt},
+        inputs_layer.outputs,
+        weights_layer_1.outputs);
+
+    LayerAddedResult linear_operator_2 = add_layer(
+        computation_graph,
+        LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
+                                                       /*use_bias=*/false,
+                                                       DataType::FLOAT,
+                                                       Activation::RELU,
+                                                       std::nullopt}},
+                   std::nullopt},
+        linear_operator_1.outputs,
+        weights_layer_2.outputs);
+
+    tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs);
+
+    RuntimeArgConfig runtime_arg_config = cpu_make_runtime_arg_config(
+        EnableProfiling::YES,
+        ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1});
+
+    // initialize training backing
+    LossAttrs loss_attrs = LossAttrs{
+        NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+    OptimizerAttrs optimizer_attrs =
+        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
+                                         /*momentum=*/0.9,
+                                         /*nesterov=*/false,
+                                         /*weight_decay=*/0.001}};
+
+    ForwardTensorSource forward_tensor_source;
+    GradientTensorSource gradient_tensor_source;
+    OptimizerTensorSource optimizer_tensor_source;
+    LossTensorSource loss_tensor_source;
+
+    TrainingComputationGraph training_computation_graph =
+        generate_training_computation_graph(computation_graph,
+                                            optimizer_attrs,
+                                            logit_tensor,
+                                            forward_tensor_source,
+                                            gradient_tensor_source,
+                                            optimizer_tensor_source,
+                                            loss_tensor_source);
+
+    LocalTrainingBacking local_training_backing =
+        make_local_training_backing_for_computation_graph(
+            /*allocator=*/allocator,
+            /*preallocated_tensors=*/{},
+            /*training_computation_graph=*/training_computation_graph,
+            /*runtime_arg_config=*/runtime_arg_config,
+            /*optimizer_attrs=*/optimizer_attrs);
+
+    // begin training loop
+    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+        allocator, local_training_backing, loss_attrs, optimizer_attrs};
+
+    int num_epochs = 5;
+    std::vector<GenericTensorAccessorR> loss_values;
+
+    for (int i = 0; i < num_epochs; i++) {
+      model_training_instance.forward();
+      model_training_instance.backward();
+      model_training_instance.update();
+      loss_values.push_back(copy_tensor_accessor_r(
+          model_training_instance.get_loss_tensor_accessor(), allocator));
+    }
+
+    // Assert that each sample in the batch has a lower loss in the last
+    // epoch than in the first epoch
+    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+    GenericTensorAccessorR last_epoch_loss = loss_values.back();
+    CHECK_MESSAGE(did_loss_decrease(first_epoch_loss, last_epoch_loss),
+                  check_kv("first_epoch_loss",
+                           format_accessor_r_contents(first_epoch_loss)),
+                  check_kv("last_epoch_loss",
+                           format_accessor_r_contents(last_epoch_loss)));
+  }
+}
+
+TEST_SUITE(FF_CUDA_TEST_SUITE) {
+  TEST_CASE("LocalBackend e2e Training (CUDA)") {
     // initialize runtime
     ManagedFFStream managed_stream{};
     ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
@@ -33,42 +174,30 @@
     Allocator 
allocator = create_local_cuda_memory_allocator(); - // allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - positive_int batch_size = 10_p; positive_int data_dim = 16_p; positive_int hidden_dim = 32_p; positive_int output_dim = 1_p; - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape output_tensor_shape = TensorShape{ TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( - output_tensor_shape, allocator); + GenericTensorAccessorW label_tensor_backing = + allocator.allocate_tensor(output_tensor_shape); // construct computation graph ComputationGraph computation_graph = make_empty_computation_graph(); + TensorShape input_tensor_shape = TensorShape{ + TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; TensorShape weight_shape_1 = TensorShape{ TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; TensorShape weight_shape_2 = TensorShape{ TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; - GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( - weight_shape_1, allocator); - GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( - weight_shape_2, allocator); - LayerAddedResult inputs_layer = add_input_layer_with_grad(computation_graph, input_tensor_shape); - tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); - GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( - input_tensor_shape, allocator); LayerAddedResult weights_layer_1 = add_layer( computation_graph, @@ -77,7 +206,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); - tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); LayerAddedResult weights_layer_2 = add_layer( computation_graph, @@ -86,7 +214,6 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { std::nullopt}, {}, {}); - tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); LayerAddedResult linear_operator_1 = add_layer( computation_graph, @@ -112,51 +239,55 @@ TEST_SUITE(FF_CUDA_TEST_SUITE) { tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), + RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config( + managed_handle.raw_handle(), EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; + ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}); // initialize training backing LossAttrs loss_attrs = LossAttrs{ NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; + OptimizerAttrs optimizer_attrs = OptimizerAttrs{ + SGDOptimizerAttrs{ + /*lr=*/0.001, + /*momentum=*/0.9, + /*nesterov=*/false, + /*weight_decay=*/0.001, + }, + }; + ForwardTensorSource forward_tensor_source; GradientTensorSource gradient_tensor_source; OptimizerTensorSource optimizer_tensor_source; + LossTensorSource loss_tensor_source; - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, - {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, - 
{TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing},
-        },
-        /*gradient_mapping=*/{},
-        /*optimizer_mapping*/ {},
-    };
+    TrainingComputationGraph training_computation_graph =
+        generate_training_computation_graph(computation_graph,
+                                            optimizer_attrs,
+                                            logit_tensor,
+                                            forward_tensor_source,
+                                            gradient_tensor_source,
+                                            optimizer_tensor_source,
+                                            loss_tensor_source);

     LocalTrainingBacking local_training_backing =
-        LocalTrainingBacking{allocator,
-                             allocated_tensors,
-                             gradient_tensor_source,
-                             optimizer_tensor_source,
-                             computation_graph,
-                             runtime_arg_config,
-                             optimizer_attrs};
+        make_local_training_backing_for_computation_graph(
+            /*allocator=*/allocator,
+            /*preallocated_tensors=*/
+            {
+                {
+                    training_tensor_guid_t{
+                        training_computation_graph.label_tensor},
+                    label_tensor_backing,
+                },
+            },
+            /*training_computation_graph=*/training_computation_graph,
+            /*runtime_arg_config=*/runtime_arg_config,
+            /*optimizer_attrs=*/optimizer_attrs);

     // begin training loop
-    ModelTrainingInstance model_training_instance =
-        ModelTrainingInstance{allocator,
-                              local_training_backing,
-                              logit_tensor,
-                              label_tensor,
-                              loss_attrs,
-                              optimizer_attrs};
+    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+        allocator, local_training_backing, loss_attrs, optimizer_attrs};

     Allocator cpu_allocator = create_local_cpu_memory_allocator();

@@ -174,9 +305,9 @@
     // Assert that each sample in the batch has a lower loss in last epoch than
     // the first epoch
     GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-    GenericTensorAccessorR last_epoch = loss_values.back();
-    CHECK(did_loss_decrease(first_epoch_loss, last_epoch));
+    GenericTensorAccessorR last_epoch_loss = loss_values.back();
+    CHECK(did_loss_decrease(first_epoch_loss, last_epoch_loss));
   }
 }
diff --git a/lib/local-execution/test/src/test_local_cost_estimator.cc b/lib/local-execution/test/src/test_local_cost_estimator.cc
deleted file mode 100644
index 42b88aa6bc..0000000000
--- a/lib/local-execution/test/src/test_local_cost_estimator.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-#include "doctest/doctest.h"
-#include "kernels/local_cuda_allocator.h"
-#include "kernels/managed_per_device_ff_handle.h"
-#include "local-execution/local_cost_estimator.h"
-#include "op-attrs/ops/attention.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "pcg/computation_graph_builder.h"
-#include "test_utils.h"
-
-using namespace ::FlexFlow;
-
-TEST_SUITE(FF_CUDA_TEST_SUITE) {
-  TEST_CASE("LocalCostEstimator") {
-    ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
-        /*workSpaceSize=*/1024 * 1024,
-        /*allowTensorOpMathConversion=*/true);
-
-    RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
-        DeviceSpecific::create(managed_handle.raw_handle()),
-        EnableProfiling::YES,
-        ProfilingSettings{/*warmup_iters=*/0,
-                          /*measure_iters=*/1}};
-
-    LocalCostEstimator cost_estimator = LocalCostEstimator{runtime_arg_config};
-
-    SUBCASE("Estimate cost -- Attention Op") {
-      positive_int embed_dim = 32_p;
-      positive_int num_heads = 10_p;
-      MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{
-          /*embed_dim=*/embed_dim,
-          /*num_heads=*/num_heads,
-          /*kdim=*/embed_dim,
-          /*vdim=*/embed_dim,
-          /*dropout=*/0.0,
-          /*bias=*/false,
-          /*add_bias_kv=*/false,
-          /*add_zero_attn=*/false,
-      };
-
-      positive_int batch_size = 40_p;
-      positive_int seq_len = 48_p;
-      positive_int feature_size = 36_p;
-
-      DataType dtype = DataType::FLOAT;
-      ParallelTensorShape inputs_shape = lift_to_parallel(TensorShape{
-          TensorDims{
-              FFOrdered{batch_size, seq_len, feature_size}},
-          DataType::FLOAT,
-      });
-
-      ParallelTensorShape weights_shape = throw_if_unexpected(
- 
get_weights_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); - ParallelTensorAttrs weight_attrs = - ParallelTensorAttrs{weights_shape, CreateGrad::YES}; - - ParallelTensorShape output_shape = throw_if_unexpected( - get_output_shape(attrs, inputs_shape, inputs_shape, inputs_shape)); - ParallelTensorAttrs output_attrs = - ParallelTensorAttrs{output_shape, CreateGrad::YES}; - - CostDetails result = cost_estimator.estimate_cost( - PCGOperatorAttrs{attrs}, - std::vector{ - inputs_shape, inputs_shape, inputs_shape}, - std::vector{weight_attrs}, - std::vector{output_attrs}, - make_1d_machine_view( - MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, - MachineSpecificationDimension::INTRA_NODE, - stride_t{1_p})); - - CHECK(result.total_elapsed_time > 0); - CHECK(result.total_mem_usage > 0); - } - } -} diff --git a/lib/local-execution/test/src/test_local_tensor_backing.cc b/lib/local-execution/test/src/test_local_tensor_backing.cc deleted file mode 100644 index bba0bd28ce..0000000000 --- a/lib/local-execution/test/src/test_local_tensor_backing.cc +++ /dev/null @@ -1,146 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/local_tensor_backing.h" -#include "test_utils.h" -#include "utils/containers/keys.h" -#include - -using namespace ::FlexFlow; - -bool is_shape_and_dtype_equal_for_tensor_backings( - std::unordered_map const &m1, - std::unordered_map const &m2) { - if (keys(m1) == keys(m2)) { - for (std::pair const - &tensor_type_backing : m1) { - if (is_shape_and_dtype_equal(tensor_type_backing.second, - m2.at(tensor_type_backing.first))) { - continue; - } else { - return false; - } - } - return true; - } else { - return false; - } -} - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("LocalTensorBacking") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - LossTensorSource loss_tensor_source; - - SUBCASE("merge_optimizer_mappings") { - SUBCASE("Both empty") { - std::unordered_map> - result = merge_optimizer_mappings({}, {}); - std::unordered_map> - correct = {}; - CHECK(result == correct); - } - - tensor_guid_t allocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensor_source.new_optimizer_tensor(); - std::unordered_map> - correct = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - SUBCASE("Unallocated is empty") { - std::unordered_map> - allocated = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings(allocated, {}); - CHECK(result == correct); - } - SUBCASE("Allocated is empty") { - std::unordered_map> - unallocated = {{allocated_tensor_guid, - {optimizer_tensor_1, optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings({}, unallocated); - CHECK(result == correct); - } - - SUBCASE("Both are partially allocated") { - std::unordered_map> - allocated = {{allocated_tensor_guid, {optimizer_tensor_1}}}; - std::unordered_map> - unallocated = {{allocated_tensor_guid, {optimizer_tensor_2}}}; - std::unordered_map> - result = merge_optimizer_mappings(allocated, unallocated); - CHECK(result == correct); - } - } - - SUBCASE("get_tensor_backings") { - Allocator allocator = create_local_cpu_memory_allocator(); - SUBCASE("Both are empty") { - std::unordered_map result = - get_tensor_backings({}, {}, allocator); - 
std::unordered_map correct = - {}; - CHECK(result == correct); - } - - tensor_guid_t allocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t unallocated_tensor_guid = - tensor_guid_source.new_mock_tensor_guid(); - - TensorAttrs allocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs unallocated_tensor_attrs = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW allocated_tensor_backing = - allocator.allocate_tensor(allocated_tensor_attrs.shape); - GenericTensorAccessorW unallocated_tensor_backing = - allocator.allocate_tensor(unallocated_tensor_attrs.shape); - - SUBCASE("Unallocated is empty") { - std::unordered_map - allocated = {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}}; - std::unordered_map result = - get_tensor_backings(allocated, {}, allocator); - CHECK(result == allocated); - } - SUBCASE("Allocated is empty") { - std::unordered_map unallocated = { - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_attrs.shape}}; - std::unordered_map result = - get_tensor_backings({}, unallocated, allocator); - std::unordered_map correct = - {{TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_backing}}; - CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct)); - } - SUBCASE("Both are partially allocated") { - std::unordered_map - allocated = {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}}; - std::unordered_map unallocated = { - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_attrs.shape}}; - - std::unordered_map result = - get_tensor_backings(allocated, unallocated, allocator); - std::unordered_map correct = - {{TensorTypeVariant{allocated_tensor_guid}, - allocated_tensor_backing}, - {TensorTypeVariant{unallocated_tensor_guid}, - unallocated_tensor_backing}}; - CHECK(is_shape_and_dtype_equal_for_tensor_backings(result, correct)); - } - } - } -} diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc deleted file mode 100644 index 4bcfa7fe17..0000000000 --- a/lib/local-execution/test/src/test_task_registry.cc +++ /dev/null @@ -1,216 +0,0 @@ -#include "doctest/doctest.h" -#include "kernels/local_cuda_allocator.h" -#include "local-execution/local_cost_estimator.h" -#include "pcg/computation_graph_builder.h" -#include "task-spec/task_signature_impl.h" -#include "utils/fmt/optional.h" -#include "utils/fmt/unordered_map.h" - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("TaskRegistry") { - - layer_guid_t layer_guid = layer_guid_t{Node{0}}; - positive_int embed_dim = 32_p; - positive_int num_heads = 10_p; - ComputationGraphOpAttrs attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - SUBCASE("register single layer") { - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - TaskRegistry correct_task_registry = [&] { - std::unordered_map> - init_task_ids = {{layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}}; - std::unordered_map> - fwd_task_ids = {{layer_guid, task_id_t::ATTENTION_FWD_TASK_ID}}; - std::unordered_map> - bwd_task_ids = {{layer_guid, 
task_id_t::ATTENTION_BWD_TASK_ID}}; - std::unordered_map task_mapping = { - {task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - return TaskRegistry{ - init_task_ids, fwd_task_ids, bwd_task_ids, task_mapping}; - }(); - - CHECK(task_registry == correct_task_registry); - } - - SUBCASE("multiple layers same task") { - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {other_layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - - SUBCASE("layer to task ids") { - std::unordered_map> correct = { - {layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}, - {other_layer_guid, task_id_t::ATTENTION_INIT_TASK_ID}, - }; - CHECK(correct == task_registry.init_task_ids); - } - - SUBCASE("task to signature+impl mapping") { - std::unordered_map - correct_task_mapping = { - {task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - CHECK(correct_task_mapping == task_registry.task_mapping); - } - } - SUBCASE("different attrs, still same task fn mapping") { - layer_guid_t layer_1 = layer_guid_t{Node{1}}; - positive_int embed_dim = 100_p; - layer_guid_t layer_2 = layer_guid_t{Node{2}}; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - {layer_1, LayerAttrs{attrs, std::nullopt}}, - {layer_2, LayerAttrs{other_attrs, std::nullopt}}, - }); - - std::unordered_map correct_task_mapping = - {{task_id_t::ATTENTION_INIT_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_INIT_TASK_ID)}, - {task_id_t::ATTENTION_FWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_FWD_TASK_ID)}, - {task_id_t::ATTENTION_BWD_TASK_ID, - get_task_sig_impl(task_id_t::ATTENTION_BWD_TASK_ID)}}; - - CHECK(correct_task_mapping == task_registry.task_mapping); - } - - SUBCASE("equality") { - SUBCASE("different attrs is still equal") { - positive_int embed_dim = 100_p; - ComputationGraphOpAttrs other_attrs = - ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/embed_dim, - /*num_heads=*/num_heads, - /*kdim=*/embed_dim, - /*vdim=*/embed_dim, - /*dropout=*/0.0, - /*bias=*/true, - /*add_bias_kv=*/false, - /*add_zero_attn=*/false, - }}; - - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - TaskRegistry other_task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{other_attrs, std::nullopt}}}); - - CHECK(task_registry == other_task_registry); - } - - SUBCASE("different layer_guid is not equal") { - TaskRegistry task_registry = construct_task_registry( - {{layer_guid, LayerAttrs{attrs, std::nullopt}}}); - layer_guid_t other_layer_guid = layer_guid_t{Node{1}}; - TaskRegistry other_task_registry = construct_task_registry( - {{other_layer_guid, LayerAttrs{attrs, std::nullopt}}}); - - 
CHECK(task_registry != other_task_registry); - } - } - - SUBCASE("registry_contains_task_for_layer") { - SUBCASE("Task exists") { - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{attrs, std::nullopt}}, - }); - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == true); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == true); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == true); - } - } - - SUBCASE("Partial task does not exist") { - ComputationGraphOpAttrs bmm_attrs = ComputationGraphOpAttrs{ - BatchMatmulAttrs{/*a_seq_length_dim=*/10_n, - /*b_seq_length_dim=*/20_n}}; - TaskRegistry task_registry = construct_task_registry({ - {layer_guid, LayerAttrs{bmm_attrs, std::nullopt}}, - }); - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == false); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == true); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == true); - } - } - - SUBCASE("Empty tasks") { - std::unordered_map> - empty_task_ids = {{layer_guid, std::nullopt}}; - TaskRegistry task_registry = - TaskRegistry{empty_task_ids, empty_task_ids, empty_task_ids, {}}; - SUBCASE("Init") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::INIT); - CHECK(result == false); - } - SUBCASE("Fwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::FWD); - CHECK(result == false); - } - SUBCASE("Bwd") { - bool result = registry_contains_task_for_layer( - task_registry, layer_guid, OpTaskType::BWD); - CHECK(result == false); - } - } - } - } -} diff --git a/lib/local-execution/test/src/test_unallocated_tensors.cc b/lib/local-execution/test/src/test_unallocated_tensors.cc deleted file mode 100644 index 0a0b99e61c..0000000000 --- a/lib/local-execution/test/src/test_unallocated_tensors.cc +++ /dev/null @@ -1,440 +0,0 @@ -#include "kernels/local_cpu_allocator.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.h" -#include "pcg/computation_graph.dtg.h" -#include "test/utils/doctest/fmt/pair.h" -#include "test/utils/doctest/fmt/unordered_map.h" -#include "test/utils/doctest/fmt/variant.h" -#include "test/utils/doctest/fmt/vector.h" -#include "test_utils.h" -#include - -using namespace ::FlexFlow; - -TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("UnallocatedTensors") { - MockTensorGuidSource tensor_guid_source; - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - - Allocator allocator = create_local_cpu_memory_allocator(); - - tensor_guid_t mock_tensor_1 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_2 = tensor_guid_source.new_mock_tensor_guid(); - tensor_guid_t mock_tensor_3_with_grad = - tensor_guid_source.new_mock_tensor_guid(); - - gradient_tensor_t grad_tensor 
= - gradient_tensor_source.new_gradient_tensor(); - optimizer_tensor_t optimizer_tensor_1 = - optimizer_tensor_source.new_optimizer_tensor(); - optimizer_tensor_t optimizer_tensor_2 = - optimizer_tensor_source.new_optimizer_tensor(); - - TensorAttrs tensor_attrs_1_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 10_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_2_no_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 20_p}}, DataType::FLOAT}, - CreateGrad::NO}; - TensorAttrs tensor_attrs_3_with_grad = TensorAttrs{ - TensorShape{TensorDims{FFOrdered{16_p, 30_p}}, DataType::FLOAT}, - CreateGrad::YES}; - - GenericTensorAccessorW tensor_backing_1 = - allocator.allocate_tensor(tensor_attrs_1_no_grad.shape); - GenericTensorAccessorW tensor_backing_2 = - allocator.allocate_tensor(tensor_attrs_2_no_grad.shape); - GenericTensorAccessorW tensor_backing_3 = - allocator.allocate_tensor(tensor_attrs_3_with_grad.shape); - - std::unordered_map tensor_attrs_mapping = { - {mock_tensor_1, tensor_attrs_1_no_grad}, - {mock_tensor_2, tensor_attrs_2_no_grad}, - {mock_tensor_3_with_grad, tensor_attrs_3_with_grad}, - }; - - SUBCASE("Without optimizer") { - SUBCASE("AllocatedTensors is empty") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - gradient_tensor_source.reset(); - UnallocatedTensors result = generate_unallocated_tensors( - empty, tensor_attrs_mapping, gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only 1 forward tensor") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - }, - {}, - {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only forward tensors") { - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - }, - {}, - {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - 
CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains only gradient tensor") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; - CHECK(result == correct); - } - - SUBCASE("AllocatedTensors contains mixture") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, {}, {}}; - CHECK(result == correct); - } - - SUBCASE("Fully AllocatedTensors") { - - AllocatedTensors allocated_forward_tensors = AllocatedTensors{ - { - {TensorTypeVariant{mock_tensor_1}, tensor_backing_1}, - {TensorTypeVariant{mock_tensor_2}, tensor_backing_2}, - {TensorTypeVariant{mock_tensor_3_with_grad}, tensor_backing_3}, - {TensorTypeVariant{grad_tensor}, tensor_backing_3}, - }, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - UnallocatedTensors result = - generate_unallocated_tensors(allocated_forward_tensors, - tensor_attrs_mapping, - gradient_tensor_source); - - UnallocatedTensors correct = UnallocatedTensors{{}, {}, {}}; - CHECK(result == correct); - } - } - - SUBCASE("With optimizer") { - SUBCASE("SGD Attrs") { - SUBCASE("without momentum") { - double momentum = 0.0; - OptimizerAttrs attrs = - OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - gradient_tensor_source.reset(); - UnallocatedTensors correct = generate_unallocated_tensors( - empty, tensor_attrs_mapping, gradient_tensor_source); - CHECK(result == correct); - } - SUBCASE("with momentum") { - double momentum = 0.9; - OptimizerAttrs attrs = - OptimizerAttrs{SGDOptimizerAttrs{0.0, momentum, false, 0.0}}; - - SUBCASE("unallocated") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - 
{TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - - CHECK(result == correct); - } - - SUBCASE("allocated") { - - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_1}}}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - - CHECK(result == correct); - } - } - } - SUBCASE("Adam Attrs") { - OptimizerAttrs attrs = - OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001, - /*beta1=*/0.9, - /*beta2=*/0.999, - /*weight_decay=*/0.001, - /*alpha_t=*/0.001, - /*beta_t=*/0.9, - /*beta2_t=*/0.999, - /*epsilon=*/1e-8}}; - SUBCASE("Empty") { - AllocatedTensors empty = AllocatedTensors{{}, {}, {}}; - - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - empty, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_1}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, - {optimizer_tensor_1, optimizer_tensor_2}}}}; - - CHECK(result == correct); - } - SUBCASE("Partially allocated") { - gradient_tensor_source.reset(); - optimizer_tensor_source.reset(); - optimizer_tensor_t optimizer_tensor_pre_allocated = - optimizer_tensor_source.new_optimizer_tensor(); - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_pre_allocated}, - tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, {optimizer_tensor_pre_allocated}}}}; - - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - 
{TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{optimizer_tensor_2}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = UnallocatedTensors{ - correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {{mock_tensor_3_with_grad, {optimizer_tensor_2}}}}; - - CHECK(result == correct); - } - - SUBCASE("Fully allocated") { - AllocatedTensors allocated_optimizer_tensor = AllocatedTensors{ - {{TensorTypeVariant{optimizer_tensor_1}, tensor_backing_3}, - {TensorTypeVariant{optimizer_tensor_2}, tensor_backing_3}}, - {}, - {{mock_tensor_3_with_grad, - {optimizer_tensor_1, optimizer_tensor_2}}}}; - - gradient_tensor_source.reset(); - UnallocatedTensors result = - generate_unallocated_tensors_with_optimizer( - allocated_optimizer_tensor, - tensor_attrs_mapping, - gradient_tensor_source, - optimizer_tensor_source, - attrs); - - std::unordered_map - correct_tensor_type_shapes = { - {TensorTypeVariant{mock_tensor_1}, - tensor_attrs_1_no_grad.shape}, - {TensorTypeVariant{mock_tensor_2}, - tensor_attrs_2_no_grad.shape}, - {TensorTypeVariant{mock_tensor_3_with_grad}, - tensor_attrs_3_with_grad.shape}, - {TensorTypeVariant{grad_tensor}, - tensor_attrs_3_with_grad.shape}, - }; - UnallocatedTensors correct = - UnallocatedTensors{correct_tensor_type_shapes, - {{mock_tensor_3_with_grad, grad_tensor}}, - {}}; - - CHECK(result == correct); - } - } - } - } -} diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index bfcab8ffbf..e7b82d012f 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -1,5 +1,6 @@ #include "models/bert/bert.h" #include "op-attrs/initializers/truncated_normal_initializer_attrs.dtg.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "pcg/computation_graph.h" @@ -57,37 +58,37 @@ tensor_guid_t tensor_guid_t const &input, InitializerAttrs const &bias_initializer, InitializerAttrs const &projection_initializer) { - assert(num_dims(cgb.get_shape(input)) == 3); - std::vector layer_norm_axis = { + ASSERT(get_num_dims(cgb.get_shape(input).dims) == 3); + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Apply layernorm across the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; tensor_guid_t self_attention = - cgb.multihead_attention(input, - input, - input, - config.hidden_size, - config.num_heads, - kdim, - vdim, + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.hidden_size, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, /*dropout=*/config.attention_probs_dropout_prob, /*bias=*/true, /*add_bias_kv=*/false, /*add_zero_attn=*/false, /*initializer=*/projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); tensor_guid_t normalized = cgb.layer_norm(cgb.add(self_attention, input), layer_norm_axis, /*elementwise_affine=*/true, config.layer_norm_eps); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, normalized)); tensor_guid_t feedforward_output = create_feedforward_network( cgb, config, normalized, bias_initializer, projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + 
ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, feedforward_output)); return cgb.layer_norm(cgb.add(normalized, feedforward_output), layer_norm_axis, @@ -138,7 +139,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { tensor_guid_t encoder_output = create_bert_encoder( cgb, config, input, bias_initializer, projection_initializer); - assert(are_tensor_guid_shapes_equivalent( + ASSERT(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, encoder_output)); tensor_guid_t out_prob = @@ -149,7 +150,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { /*data_type=*/DataType::FLOAT, /*projection_initializer=*/projection_initializer, /*bias_initializer=*/bias_initializer)); - assert( + ASSERT( (cgb.get_shape(out_prob) == TensorShape{ TensorDims{FFOrdered{ diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 8bbbccdbaf..13dd650c2c 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -85,7 +85,8 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; positive_int shape = config.feature_shapes.at(feature_name); - tensor_guid_t input = create_input_tensor({config.batch_size, shape}); + tensor_guid_t input = + create_input_tensor(FFOrdered{config.batch_size, shape}); all_inputs.push_back(input); if (contains(input_models, feature_name)) { diff --git a/lib/models/src/models/dlrm/dlrm.cc b/lib/models/src/models/dlrm/dlrm.cc index d1dd52b4da..8e06a2dd6a 100644 --- a/lib/models/src/models/dlrm/dlrm.cc +++ b/lib/models/src/models/dlrm/dlrm.cc @@ -129,11 +129,12 @@ ComputationGraph get_dlrm_computation_graph(DLRMConfig const &config) { std::vector sparse_inputs = repeat(num_elements(config.embedding_size), [&]() { return create_input_tensor( - {config.batch_size, config.embedding_bag_size}, DataType::INT64); + FFOrdered{config.batch_size, config.embedding_bag_size}, + DataType::INT64); }); tensor_guid_t dense_input = create_input_tensor( - {config.batch_size, config.dense_arch_layer_sizes.front()}, + FFOrdered{config.batch_size, config.dense_arch_layer_sizes.front()}, DataType::FLOAT); // Construct the model diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc index dfc40a5720..5298c7682b 100644 --- a/lib/models/src/models/transformer/transformer.cc +++ b/lib/models/src/models/transformer/transformer.cc @@ -32,7 +32,7 @@ tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb, TransformerConfig const &config, tensor_guid_t const &input) { - std::vector layer_norm_axis = { + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Normalize the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; @@ -81,7 +81,7 @@ tensor_guid_t TransformerConfig const &config, tensor_guid_t const &input, tensor_guid_t const &encoder_output) { - std::vector layer_norm_axis = { + std::set layer_norm_axis = { relative_ff_dim_t{-1}}; // Normalize the last dim positive_int kdim = positive_int{config.dim_feedforward / config.num_heads}; positive_int vdim = positive_int{config.dim_feedforward / config.num_heads}; diff --git a/lib/op-attrs/include/op-attrs/datatype.h 
b/lib/op-attrs/include/op-attrs/datatype.h index ad45dcb13c..eab346f41f 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h
@@ -3,7 +3,7 @@ #include "op-attrs/datatype.dtg.h" #include "utils/fmt.h" -#include "utils/fp16.h" +#include "utils/half.h" #include "utils/positive_int/positive_int.h" #include
diff --git a/lib/op-attrs/include/op-attrs/datatype_value.h b/lib/op-attrs/include/op-attrs/datatype_value.h index b646692de9..fcd1245b54 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.h +++ b/lib/op-attrs/include/op-attrs/datatype_value.h
@@ -3,15 +3,19 @@ #include "op-attrs/datatype.dtg.h" #include "op-attrs/datatype_value.dtg.h" +#include "utils/half.h" namespace FlexFlow { +DataTypeValue make_half_data_type_value(half value); DataTypeValue make_float_data_type_value(float value); DataTypeValue make_double_data_type_value(double value); DataTypeValue make_int32_data_type_value(int32_t value); DataTypeValue make_int64_data_type_value(int64_t value); DataTypeValue make_bool_data_type_value(bool value); +DataTypeValue make_zero_data_type_value_of_type(DataType); + DataType get_data_type_of_data_type_value(DataTypeValue); } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/datatype_value.variant.toml b/lib/op-attrs/include/op-attrs/datatype_value.variant.toml index 3386e9d131..4c867917b0 100644 --- a/lib/op-attrs/include/op-attrs/datatype_value.variant.toml +++ b/lib/op-attrs/include/op-attrs/datatype_value.variant.toml
@@ -9,6 +9,19 @@ features = [ "fmt", ] +includes = [ + "utils/half.h", +] + +src_includes = [ + "utils/json/half.h", + "utils/rapidcheck/half.h", + "utils/fmt/half.h", +] + +[[values]] +type = "half" + [[values]] type = "float"
diff --git a/lib/op-attrs/include/op-attrs/ff_dim_t.h b/lib/op-attrs/include/op-attrs/ff_dim_t.h index 5fab792b13..0979201f67 100644 --- a/lib/op-attrs/include/op-attrs/ff_dim_t.h +++ b/lib/op-attrs/include/op-attrs/ff_dim_t.h
@@ -6,7 +6,11 @@ #include "rapidcheck.h" namespace FlexFlow { + relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim); + +ff_dim_t add_to_ff_dim(ff_dim_t ff_dim, int value); + } // namespace FlexFlow namespace rc {
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h index 92ed211c31..fe2e8d9dc6 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/ff_ordered.h
@@ -12,16 +12,14 @@ template <typename T> struct FFOrdered { FFOrdered() {} - FFOrdered(std::initializer_list<T> const &l) : contents(l.begin(), l.end()) {} - - FFOrdered(std::vector<T> const &contents) - : contents(contents.begin(), contents.end()) {} + explicit FFOrdered(std::initializer_list<T> const &l) + : contents(l.begin(), l.end()) {} template <typename It> - FFOrdered(It begin, It end) : contents(begin, end) {} + explicit FFOrdered(It begin, It end) : contents(begin, end) {} template <size_t MAXSIZE> - FFOrdered(stack_vector<T, MAXSIZE> const &contents) + explicit FFOrdered(stack_vector<T, MAXSIZE> const &contents) : contents(contents.begin(), contents.end()) {} T const &at(ff_dim_t idx) const {
@@ -190,7 +188,8 @@ namespace nlohmann { template <typename T> struct adl_serializer<::FlexFlow::FFOrdered<T>> { static ::FlexFlow::FFOrdered<T> from_json(nlohmann::json const &j) { - return {j.template get<std::vector<T>>()}; + std::vector<T> v = j.template get<std::vector<T>>(); + return ::FlexFlow::FFOrdered<T>(v.cbegin(), v.cend()); } static void to_json(nlohmann::json &j, ::FlexFlow::FFOrdered<T> const &x) {
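// Editorial note (not part of the patch): the constructor changes above make
// FFOrdered's converting constructors explicit, which is what forces the
// direct-initialization churn in the model and test diffs later in this patch.
// A minimal illustration, assuming FFOrdered as modified above:
//
//   FFOrdered<int> a = FFOrdered<int>{1, 2};  // ok: direct-initialization
//   FFOrdered<int> b{1, 2};                   // ok: direct-list-initialization
//   FFOrdered<int> c = {1, 2};                // ill-formed: copy-list-initialization
//                                             // cannot call an explicit constructor
//   FFOrdered<int> d = {};                    // still ok: empty braces select the
//                                             // (non-explicit) default constructor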
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h b/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h new file mode 100644 index 0000000000..d41e68342a --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/filtrans.h
@@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_FILTRANS_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_FILTRANS_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "utils/containers/filtrans.h" +#include "utils/containers/vector_of.h" + +namespace FlexFlow { + +template <typename In, typename F, typename Out = typename std::invoke_result_t<F, In>::value_type> +FFOrdered<Out> filtrans(FFOrdered<In> const &v, F &&f) { + return ff_ordered_of(filtrans(vector_of(v), f)); +} + +} // namespace FlexFlow + +#endif
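// Editorial sketch (not part of the patch): filtrans = filter + transform.
// The callback returns std::optional<Out>; nullopt entries are dropped and the
// remaining values are unwrapped, e.g. (hypothetical values):
//
//   FFOrdered<int> xs = FFOrdered<int>{1, 2, 3, 4};
//   FFOrdered<int> evens_doubled =
//       filtrans(xs, [](int x) -> std::optional<int> {
//         return (x % 2 == 0) ? std::optional<int>{2 * x} : std::nullopt;
//       });
//   // evens_doubled == FFOrdered<int>{4, 8}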
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h b/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h new file mode 100644 index 0000000000..0986bf560d --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/reversed.h
@@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_REVERSED_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_REVERSED_H + +#include "op-attrs/ff_ordered/ff_ordered.h" + +namespace FlexFlow { + +template <typename T> +FFOrdered<T> reversed(FFOrdered<T> const &t) { + FFOrdered<T> result(std::crbegin(t), std::crend(t)); + return result; +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h index 79217c4cc3..c8ca49d4cf 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/slice.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_SLICE_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/slice.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h"
@@ -15,7 +16,7 @@ FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d, int raw_start = start.value.unwrap_nonnegative(); std::optional<int> raw_end = transform( end, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); - return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)}; + return ff_ordered_of(slice(vector_of(d), raw_start, raw_end)); } template <typename T>
@@ -27,7 +28,7 @@ FFOrdered<T> relative_ff_dim_t_nonoverloaded_slice( std::optional<int> raw_end = transform(end, [](relative_ff_dim_t const &i) { return i.value; }); - return FFOrdered<T>{slice(vector_of(d), raw_start, raw_end)}; + return ff_ordered_of(slice(vector_of(d), raw_start, raw_end)); } template <typename T>
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h index 3a8eeb9ecf..c7ee3c2c54 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/transform.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/transform.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_DIM_ORDERED_TRANSFORM_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/vector_of.h" #include "utils/containers/vector_transform.h"
@@ -9,7 +10,7 @@ namespace FlexFlow { template <typename T, typename F, typename Out = std::invoke_result_t<F, T>> FFOrdered<Out> transform(FFOrdered<T> const &d, F &&f) { - return FFOrdered<Out>{vector_transform(vector_of(d), f)}; + return ff_ordered_of(vector_transform(vector_of(d), f)); } } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h index fe207740f7..42ca3d69a3 100644 --- a/lib/op-attrs/include/op-attrs/ff_ordered/zip.h +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip.h
@@ -2,6 +2,7 @@ #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_H #include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" #include "utils/containers/vector_of.h" #include "utils/containers/zip.h"
@@ -10,7 +11,7 @@ namespace FlexFlow { template <typename T1, typename T2> FFOrdered<std::pair<T1, T2>> zip(FFOrdered<T1> const &lhs, FFOrdered<T2> const &rhs) { - return FFOrdered<std::pair<T1, T2>>{zip(vector_of(lhs), vector_of(rhs))}; + return ff_ordered_of(zip(vector_of(lhs), vector_of(rhs))); } } // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc new file mode 100644 index 0000000000..63be94ab9c --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.cc
@@ -0,0 +1,14 @@ +#include "op-attrs/ff_ordered/zip_with.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T1 = value_type<0>; +using T2 = value_type<1>; +using Result = value_type<2>; +using F = std::function<Result(T1 const &, T2 const &)>; + +template FFOrdered<Result> + zip_with(FFOrdered<T1> const &, FFOrdered<T2> const &, F &&); + +} // namespace FlexFlow
diff --git a/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h new file mode 100644 index 0000000000..25ae7e5a55 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ff_ordered/zip_with.h
@@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_WITH_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_FF_ORDERED_ZIP_WITH_H + +#include "op-attrs/ff_ordered/ff_ordered.h" +#include "op-attrs/ff_ordered/ff_ordered_of.h" +#include "utils/containers/vector_of.h" +#include "utils/containers/zip_with.h" + +namespace FlexFlow { + +template <typename T1, typename T2, typename F, typename Result = std::invoke_result_t<F, T1, T2>> +FFOrdered<Result> + zip_with(FFOrdered<T1> const &lhs, FFOrdered<T2> const &rhs, F &&f) { + return ff_ordered_of(zip_with(vector_of(lhs), vector_of(rhs), f)); +} + +} // namespace FlexFlow + +#endif
diff --git a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml index 66d475aa46..f76c7c683f 100644 --- a/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/gather_attrs.struct.toml
@@ -10,8 +10,11 @@ features = [ ] includes = [ + "op-attrs/ff_dim_t.dtg.h", +] + +src_includes = [ "op-attrs/ff_dim_t.h", - "op-attrs/ff_dim_t.dtg.h" ] [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml index d2a539e140..12e29d8a60 100644 --- a/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/layer_norm_attrs.struct.toml
@@ -10,14 +10,19 @@ features = [ ] includes = [ - "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", - "utils/stack_vector/stack_vector.h", + "<set>", +] + +src_includes = [ + "utils/fmt/set.h", + "utils/hash/set.h", + "op-attrs/ff_dim_t.h", ] [[fields]] name = "axes" -type = "::FlexFlow::stack_vector<::FlexFlow::ff_dim_t, MAX_TENSOR_DIM>" +type = "std::set<::FlexFlow::ff_dim_t>" [[fields]] name = "elementwise_affine"
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h index a21602e28c..0f5b987944 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.h +++ b/lib/op-attrs/include/op-attrs/tensor_dims.h
@@ -3,21 +3,55 @@ #include "op-attrs/parallel_tensor_dims.dtg.h" #include "op-attrs/tensor_dims.dtg.h" +#include "op-attrs/tensor_dims_coord.dtg.h" +#include "utils/bidict/bidict.h"
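// Editorial sketch (not part of the patch): intended behavior of the dim-index
// helpers declared below, assuming dims = TensorDims{FFOrdered<positive_int>{4_p, 7_p}}:
//
//   get_num_dims(dims)                        -> 2_n
//   dim_at_idx(dims, ff_dim_t{1_n})           -> 7_p
//   dim_at_idx(dims, relative_ff_dim_t{-1})   -> 7_p   (negative indices count from the end)
//   try_dim_at_idx(dims, ff_dim_t{5_n})       -> std::nullopt instead of failing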
namespace FlexFlow { FFOrdered<positive_int> const &ff_ordered(TensorDims const &); -nonnegative_int num_dims(TensorDims const &); +bool tensor_dims_has_dim(TensorDims const &, ff_dim_t); + +nonnegative_int get_num_dims(TensorDims const &); + positive_int dim_at_idx(TensorDims const &, relative_ff_dim_t); positive_int &dim_at_idx(TensorDims &, relative_ff_dim_t); + +positive_int dim_at_idx(TensorDims const &, ff_dim_t); +positive_int &dim_at_idx(TensorDims &, ff_dim_t); + +std::optional<positive_int> try_dim_at_idx(TensorDims const &, + relative_ff_dim_t); +std::optional<positive_int> try_dim_at_idx(TensorDims const &, ff_dim_t); + positive_int get_num_elements(TensorDims const &); bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal); + +bool tensor_dims_contains_coord(TensorDims const &tensor_dims, + TensorDimsCoord const &coord); + +TensorDimsCoord get_broadcast_src_coord(TensorDims const &input_dims, + TensorDims const &output_dims, + TensorDimsCoord const &dst_coord); + +std::unordered_set<TensorDimsCoord> + get_tensor_dims_coord_set(TensorDims const &tensor_dims); + +std::unordered_set<ff_dim_t> get_ff_dim_t_set(TensorDims const &); + std::optional<TensorDims> get_broadcast_target_dims(std::unordered_set<TensorDims> const &); + +TensorDims + tensor_dims_drop_dims(TensorDims const &dims, + std::function<bool(ff_dim_t)> const &should_drop_dim); + +TensorDims slice_tensor_dims(TensorDims const &, + ff_dim_t const &start, + std::optional<ff_dim_t> const &stop); + TensorDims slice_tensor_dims(TensorDims const &, relative_ff_dim_t const &start, std::optional<relative_ff_dim_t> const &stop);
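// Editorial sketch (not part of the patch): the broadcast-coordinate mapping
// declared above, worked on a small example. Trailing dims are aligned, and a
// source dim of size 1 always maps to coordinate 0:
//
//   TensorDims input = TensorDims{FFOrdered<positive_int>{1_p, 3_p}};
//   TensorDims output = TensorDims{FFOrdered<positive_int>{2_p, 3_p}};
//   TensorDimsCoord dst = TensorDimsCoord{FFOrdered<nonnegative_int>{1_n, 2_n}};
//   get_broadcast_src_coord(input, output, dst)
//       -> TensorDimsCoord{FFOrdered<nonnegative_int>{0_n, 2_n}}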
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims_coord.h b/lib/op-attrs/include/op-attrs/tensor_dims_coord.h new file mode 100644 index 0000000000..44448c5f96 --- /dev/null +++ b/lib/op-attrs/include/op-attrs/tensor_dims_coord.h
@@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_TENSOR_DIMS_COORD_H +#define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_TENSOR_DIMS_COORD_H + +#include "op-attrs/tensor_dims_coord.dtg.h" + +namespace FlexFlow { + +nonnegative_int + tensor_dims_coord_get_num_dims(TensorDimsCoord const &tensor_dims_coord); + +TensorDimsCoord tensor_dims_coord_drop_dims( + TensorDimsCoord const &coord, + std::function<bool(ff_dim_t)> const &should_drop_dim); + +} // namespace FlexFlow + +#endif
diff --git a/lib/kernels/include/kernels/array_coord.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml similarity index 74% rename from lib/kernels/include/kernels/array_coord.struct.toml rename to lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml index 8ce121f2bf..53f4405389 100644 --- a/lib/kernels/include/kernels/array_coord.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims_coord.struct.toml
@@ -1,17 +1,16 @@ namespace = "FlexFlow" -name = "ArrayCoord" +name = "TensorDimsCoord" features = [ "eq", "ord", "hash", - "fmt", - "rapidcheck", "json", + "fmt", ] includes = [ "op-attrs/ff_ordered/ff_ordered.h", - "utils/nonnegative_int/nonnegative_int.h" + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]]
diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index 3cafdda4b8..403b853fab 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h
@@ -2,14 +2,15 @@ #define _FLEXFLOW_OPATTRS_TENSOR_SHAPE_H #include "op-attrs/tensor_shape.dtg.h" +#include "utils/units/num_bytes_t.h" namespace FlexFlow { -nonnegative_int num_dims(TensorShape const &); -positive_int dim_at_idx(TensorShape const &, relative_ff_dim_t); -positive_int &dim_at_idx(TensorShape &, relative_ff_dim_t); -positive_int get_num_elements(TensorShape const &); -positive_int get_size_in_bytes(TensorShape const &); +num_bytes_t get_size_in_bytes(TensorShape const &); + +TensorShape tensor_shape_drop_dims( + TensorShape const &coord, + std::function<bool(ff_dim_t)> const &should_drop_dim); TensorShape slice_tensor_shape(TensorShape const &, relative_ff_dim_t const &start,
diff --git a/lib/op-attrs/src/op-attrs/datatype_value.cc b/lib/op-attrs/src/op-attrs/datatype_value.cc index a4abde2cb4..620b342cd8 100644 --- a/lib/op-attrs/src/op-attrs/datatype_value.cc +++ b/lib/op-attrs/src/op-attrs/datatype_value.cc
@@ -1,8 +1,13 @@ #include "op-attrs/datatype_value.h" #include "utils/overload.h" +#include <optional> namespace FlexFlow { +DataTypeValue make_half_data_type_value(half value) { + return DataTypeValue{value}; +} + DataTypeValue make_float_data_type_value(float value) { return DataTypeValue{value}; }
@@ -25,6 +30,7 @@ DataTypeValue make_bool_data_type_value(bool value) { DataType get_data_type_of_data_type_value(DataTypeValue value) { return value.visit(overload{ + [](half) { return DataType::HALF; }, [](float) { return DataType::FLOAT; }, [](double) { return DataType::DOUBLE; }, [](int32_t) { return DataType::INT32; },
@@ -33,4 +39,36 @@ DataType get_data_type_of_data_type_value(DataTypeValue value) { }); } +DataTypeValue make_zero_data_type_value_of_type(DataType data_type) { + std::optional<DataTypeValue> result = std::nullopt; + + switch (data_type) { + case DataType::HALF: + result = make_half_data_type_value(0.0); + break; + case DataType::FLOAT: + result = make_float_data_type_value(0.0); + break; + case DataType::DOUBLE: + result = make_double_data_type_value(0.0); + break; + case DataType::INT32: + result = make_int32_data_type_value(0); + break; + case DataType::INT64: + result = make_int64_data_type_value(0); + break; + case DataType::BOOL: + result = make_bool_data_type_value(false); + break; + default: + PANIC("Unhandled DataType value", data_type); + }; + + ASSERT(result.has_value()); + ASSERT(get_data_type_of_data_type_value(result.value()) == data_type); + + return result.value(); +} + } // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc index 44672fc391..63c783d909 100644 --- a/lib/op-attrs/src/op-attrs/ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc
@@ -1,9 +1,16 @@ #include "op-attrs/ff_dim_t.h" namespace FlexFlow { + relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { + return relative_ff_dim_t{ff_dim.value.unwrap_nonnegative()}; } + +ff_dim_t add_to_ff_dim(ff_dim_t ff_dim, int value) { + return ff_dim_t{nonnegative_int{ff_dim.value.unwrap_nonnegative() + value}}; +} + } // namespace FlexFlow namespace rc {
diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc b/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc new file mode 100644 index 0000000000..ff5e4c4af7 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/filtrans.cc
@@ -0,0 +1,12 @@ +#include "op-attrs/ff_ordered/filtrans.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using In = value_type<0>; +using Out = value_type<1>; +using F = std::function<std::optional<Out>(In const &)>; + +template FFOrdered<Out> filtrans(FFOrdered<In> const &, F &&); + +} // namespace FlexFlow
diff --git a/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc b/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc new file mode 100644 index 0000000000..5e8f2eb6e3 --- /dev/null +++ b/lib/op-attrs/src/op-attrs/ff_ordered/reversed.cc @@ -0,0 +1,10 @@ +#include
"op-attrs/ff_ordered/reversed.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template FFOrdered reversed(FFOrdered const &); + +} // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 5800f086ef..cc6ef8cfac 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -2,6 +2,7 @@ #include "op-attrs/ops/attention/multihead_attention_inputs.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/extend.h" #include "utils/expected.h" @@ -34,15 +35,15 @@ positive_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { } positive_int get_qSize(TensorShape const &query_shape) { - return dim_at_idx(query_shape, relative_ff_dim_t{0}); + return dim_at_idx(query_shape.dims, relative_ff_dim_t{0}); } positive_int get_kSize(TensorShape const &key_shape) { - return dim_at_idx(key_shape, relative_ff_dim_t{0}); + return dim_at_idx(key_shape.dims, relative_ff_dim_t{0}); } positive_int get_vSize(TensorShape const &value_shape) { - return dim_at_idx(value_shape, relative_ff_dim_t{0}); + return dim_at_idx(value_shape.dims, relative_ff_dim_t{0}); } positive_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 7bf3b9d91e..102e54cbe3 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -1,4 +1,5 @@ #include "op-attrs/ops/attention/multihead_attention_inputs.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { @@ -12,28 +13,28 @@ tl::expected parse_attention_input_shape(TensorShape const &input_q, TensorShape const &input_k, TensorShape const &input_v) { - if (num_dims(input_q) != 3) { + if (get_num_dims(input_q.dims) != 3) { return tl::unexpected( fmt::format("Query input has incorrect number of dims: {} != {}", - num_dims(input_q), + get_num_dims(input_q.dims), 3)); } - if (num_dims(input_k) != 3) { + if (get_num_dims(input_k.dims) != 3) { return tl::unexpected( fmt::format("Key input has incorrect number of dims: {} != {}", - num_dims(input_k), + get_num_dims(input_k.dims), 3)); } - if (num_dims(input_v) != 3) { + if (get_num_dims(input_v.dims) != 3) { return tl::unexpected( fmt::format("Value input has incorrect number of dims: {} != {}", - num_dims(input_v), + get_num_dims(input_v.dims), 3)); } - positive_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - positive_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - positive_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + positive_int seq_len_q = dim_at_idx(input_q.dims, relative_ff_dim_t{-2}); + positive_int seq_len_k = dim_at_idx(input_k.dims, relative_ff_dim_t{-2}); + positive_int seq_len_v = dim_at_idx(input_v.dims, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +44,9 @@ tl::expected seq_len_v)); } - positive_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - positive_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - positive_int batch_size_v = dim_at_idx(input_v, 
relative_ff_dim_t{-3}); + positive_int batch_size_q = dim_at_idx(input_q.dims, relative_ff_dim_t{-3}); + positive_int batch_size_k = dim_at_idx(input_k.dims, relative_ff_dim_t{-3}); + positive_int batch_size_v = dim_at_idx(input_v.dims, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +64,9 @@ tl::expected input_v.data_type)); } - positive_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - positive_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - positive_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + positive_int q_size = dim_at_idx(input_q.dims, relative_ff_dim_t{-1}); + positive_int k_size = dim_at_idx(input_k.dims, relative_ff_dim_t{-1}); + positive_int v_size = dim_at_idx(input_v.dims, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index 33c4987233..3c76561d17 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -1,5 +1,6 @@ #include "op-attrs/ops/batch_matmul.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" namespace FlexFlow { @@ -39,16 +40,16 @@ tl::expected // out will be a (b×n×p) tensor. // https://pytorch.org/docs/stable/generated/torch.bmm.html - if (num_dims(input_lhs) != 3) { + if (get_num_dims(input_lhs.dims) != 3) { return tl::unexpected( fmt::format("LHS input has incorrect number of shard dims: {} != {}", - num_dims(input_lhs), + get_num_dims(input_lhs.dims), 3)); } - if (num_dims(input_rhs) != 3) { + if (get_num_dims(input_rhs.dims) != 3) { return tl::unexpected( fmt::format("RHS input has incorrect number of shard dims: {} != {}", - num_dims(input_rhs), + get_num_dims(input_rhs.dims), 3)); } if (input_lhs.data_type != input_rhs.data_type) { @@ -57,13 +58,13 @@ tl::expected input_rhs.data_type)); } - positive_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - positive_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - positive_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + positive_int lhs_b = dim_at_idx(input_lhs.dims, relative_ff_dim_t{0}); + positive_int n = dim_at_idx(input_lhs.dims, relative_ff_dim_t{1}); + positive_int lhs_m = dim_at_idx(input_lhs.dims, relative_ff_dim_t{2}); - positive_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - positive_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - positive_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + positive_int rhs_b = dim_at_idx(input_rhs.dims, relative_ff_dim_t{0}); + positive_int rhs_m = dim_at_idx(input_rhs.dims, relative_ff_dim_t{1}); + positive_int p = dim_at_idx(input_rhs.dims, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index f42467320b..cfe5bafaba 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -2,6 +2,7 @@ #include "op-attrs/ff_ordered/concat.h" #include "op-attrs/ff_ordered/slice.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/any_of.h" #include "utils/containers/extend.h" @@ -23,7 +24,7 @@ std::vector static std::optional check_input_shape(BatchNormAttrs const &, TensorShape const &input_shape) { - if (num_dims(input_shape) < 2) { + if 
(get_num_dims(input_shape.dims) < 2) { return fmt::format( "BatchNormAttrs expected input dims >= 2, but received input shape {}", input_shape); @@ -68,7 +69,8 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + positive_int num_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{1}); return TensorShape{ TensorDims{FFOrdered{ diff --git a/lib/op-attrs/src/op-attrs/ops/broadcast.cc b/lib/op-attrs/src/op-attrs/ops/broadcast.cc index 31e241e27b..d84a9ee46e 100644 --- a/lib/op-attrs/src/op-attrs/ops/broadcast.cc +++ b/lib/op-attrs/src/op-attrs/ops/broadcast.cc @@ -13,7 +13,7 @@ RecordFormatter as_dot(BroadcastAttrs const &attrs) { return rr; }; - for (int i = 0; i < num_dims(attrs.target_dims); i++) { + for (int i = 0; i < get_num_dims(attrs.target_dims); i++) { r << kv(fmt::format("target_dims[{}]", i), dim_at_idx(attrs.target_dims, relative_ff_dim_t{i})); } @@ -24,7 +24,7 @@ RecordFormatter as_dot(BroadcastAttrs const &attrs) { tl::expected get_output_shape(BroadcastAttrs const &attrs, TensorShape const &input_shape) { - if (num_dims(attrs.target_dims) < num_dims(input_shape.dims)) { + if (get_num_dims(attrs.target_dims) < get_num_dims(input_shape.dims)) { return tl::unexpected(fmt::format( "get_output_shape for Broadcast expected num_dims(input_dims) <= " "num_dims(target_dims), but recieved input_shape {} with num dims " diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index aed118dd62..8f2752b71f 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -33,7 +33,7 @@ tl::expected } if (!are_all_same(transform( - inputs, [](TensorShape const &s) { return num_dims(s); }))) { + inputs, [](TensorShape const &s) { return get_num_dims(s.dims); }))) { return tl::unexpected( fmt::format("get_output_shape for Concat expected all inputs to have " "the same number of dimensions, but receieved {}", @@ -51,7 +51,7 @@ tl::expected std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { - return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); + return dim_at_idx(s.dims, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); }); positive_int output_axis_dim_size = sum(axis_dim_sizes); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index 2ac90c1c9c..6ff1b8a06e 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -115,7 +115,7 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{1_p}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, 1_p, @@ -139,7 +139,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ parsed.discard_copy_reduction_degree, }; @@ -161,7 +161,7 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_p}; - FFOrdered shard_degrees = { + 
FFOrdered shard_degrees = FFOrdered{ parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, 1_p, diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index 75db5c56fb..79bb14f2b2 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -1,22 +1,23 @@ #include "op-attrs/ops/conv_2d/conv_2d_input_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { - assert(num_dims(input) == 4); + ASSERT(get_num_dims(input.dims) == 4); - positive_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - positive_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - positive_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); - positive_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input.dims, relative_ff_dim_t{0}); + positive_int in_channels = dim_at_idx(input.dims, relative_ff_dim_t{1}); + positive_int in_height = dim_at_idx(input.dims, relative_ff_dim_t{2}); + positive_int in_width = dim_at_idx(input.dims, relative_ff_dim_t{3}); return Conv2DInputShape{ - num_samples, - in_channels, - in_height, - in_width, - input.data_type, + /*num_samples=*/num_samples, + /*num_channels=*/in_channels, + /*height=*/in_height, + /*width=*/in_width, + /*datatype=*/input.data_type, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index 809b4cdaf9..e0e1a44b3b 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -3,6 +3,7 @@ #include "op-attrs/ff_ordered/transform.h" #include "op-attrs/ops/embedding_attrs.dtg.h" #include "op-attrs/parallel_tensor_dims.h" +#include "op-attrs/tensor_dims.h" #include "utils/containers/product.h" #include "utils/fmt/optional.h" #include "utils/integer_conversions.h" @@ -52,7 +53,7 @@ tl::expected } TensorShape output = input; - dim_at_idx(output, relative_ff_dim_t{-1}) = attrs.out_channels; + dim_at_idx(output.dims, relative_ff_dim_t{-1}) = attrs.out_channels; output.data_type = attrs.data_type; return output; } @@ -120,7 +121,7 @@ tl::expected [](ShardParallelDim const &d) -> positive_int { return d.degree; }))}; positive_int entry_dim_degree = 1_p; positive_int out_channel_degree = get_discard_copy_degree(input); - FFOrdered shard_degrees = { + FFOrdered shard_degrees = FFOrdered{ entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index a2183a71b4..14180cecf8 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -26,7 +26,7 @@ TensorShape get_output_shape(FlatAttrs const &attrs, TensorDims{ concat(std::vector{ leading_dims, - {product(flattened_dims)}, + FFOrdered{product(flattened_dims)}, trailing_dims, }), }, @@ -59,7 +59,7 @@ tl::expected /*shard_degrees=*/ concat(std::vector{ slice(input_degrees.shard_degrees, ff_dim_t{0_n}, attrs.start_dim), - {product(flattened_dim_degrees)}, + FFOrdered{product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), }; diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 3637aacc5c..e0db1cdfe7 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ 
b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -2,6 +2,7 @@ #include "op-attrs/ff_ordered/ff_ordered_of.h" #include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/all_of.h" #include "utils/containers/any_of.h" @@ -9,6 +10,7 @@ #include "utils/containers/extend.h" #include "utils/containers/filter.h" #include "utils/expected.h" +#include "utils/fmt/set.h" namespace FlexFlow { @@ -28,7 +30,7 @@ static std::optional check_input_shape(LayerNormAttrs const &attrs, TensorShape const &input_shape) { if (any_of(attrs.axes, [&](ff_dim_t axis) { - return axis.value >= num_dims(input_shape); + return axis.value >= get_num_dims(input_shape.dims); })) { return fmt::format( "LayerNorm axes {} out-of-bounds for input tensor shape {}", @@ -74,7 +76,7 @@ tl::expected [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { - return dim_at_idx(input_shape, + return dim_at_idx(input_shape.dims, relative_ff_dim_t_from_ff_dim_t(dim_idx)); }); diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index 32791e81a9..37f504f873 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -3,6 +3,7 @@ #include "op-attrs/ff_ordered/transform.h" #include "op-attrs/initializers/kaiming_initializer_mode.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" #include "utils/containers/product.h" #include "utils/expected.h" @@ -44,11 +45,12 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - positive_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + positive_int in_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, attrs.out_channels}, + FFOrdered{attrs.out_channels, in_channels}, }, input_shape.data_type, }; @@ -105,8 +107,8 @@ tl::expected relative_ff_dim_t{0}, relative_ff_dim_t{-1}))}; FFOrdered shard_degrees = FFOrdered{ - shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, get_discard_copy_degree(input), + shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, }; return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 361216cce4..ee75340ed0 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -15,7 +15,7 @@ tl::expected // AdaptivePool2D semantics pulled from // https://stackoverflow.com/questions/53841509/how-does-adaptive-pooling-in-pytorch-work/63603993 - if (num_dims(input_dims) != 4) { + if (get_num_dims(input_dims) != 4) { return tl::unexpected( fmt::format("make_adaptive_pool2d_attrs expected input tensor to " "have 4 dims, but received dims {}", @@ -119,17 +119,19 @@ static positive_int calculate_output_size(positive_int input_size, tl::expected get_output_shape(Pool2DAttrs const &attrs, TensorShape const &input_shape) { - if (num_dims(input_shape) != 4) { + if (get_num_dims(input_shape.dims) != 4) { return tl::unexpected( fmt::format("get_output_shape for Pool2DAttrs expected input tensor to " "have 4 dims, but received shape {}", input_shape)); } - positive_int num_samples = dim_at_idx(input_shape, 
relative_ff_dim_t{0}); - positive_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - positive_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - positive_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + positive_int num_samples = dim_at_idx(input_shape.dims, relative_ff_dim_t{0}); + positive_int num_channels = + dim_at_idx(input_shape.dims, relative_ff_dim_t{1}); + positive_int input_height = + dim_at_idx(input_shape.dims, relative_ff_dim_t{2}); + positive_int input_width = dim_at_idx(input_shape.dims, relative_ff_dim_t{3}); positive_int output_height = calculate_output_size(/*input_size=*/input_height, diff --git a/lib/op-attrs/src/op-attrs/ops/softmax.cc b/lib/op-attrs/src/op-attrs/ops/softmax.cc index 0d55a2ec2c..2c03fe1689 100644 --- a/lib/op-attrs/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/src/op-attrs/ops/softmax.cc @@ -1,5 +1,6 @@ #include "op-attrs/ops/softmax.h" #include "op-attrs/parallel_tensor_shape.h" +#include "op-attrs/tensor_dims.h" #include "op-attrs/tensor_shape.h" namespace FlexFlow { @@ -7,7 +8,7 @@ namespace FlexFlow { tl::expected get_output_shape(SoftmaxAttrs const &attrs, TensorShape const &input_shape) { - if (attrs.dim.value >= num_dims(input_shape)) { + if (attrs.dim.value >= get_num_dims(input_shape.dims)) { return tl::unexpected( fmt::format("get_output_shape for Softmax received out-of-bounds " "attrs.dim {} for input tensor shape {}", diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index dd5230f5a4..1c77bc6ca8 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -43,9 +43,11 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { ParallelTensorDims lift_to_parallel(TensorDims const &dims) { std::vector shard_degrees = - repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_p); - return lift_to_parallel_with_degrees( - dims, SumDegree{1_p}, DiscardCopyDegree{1_p}, shard_degrees); + repeat_element(/*num_times=*/get_num_dims(dims), /*element=*/1_p); + return lift_to_parallel_with_degrees(dims, + SumDegree{1_p}, + DiscardCopyDegree{1_p}, + ff_ordered_of(shard_degrees)); } ParallelTensorDims lift_to_parallel_with_degrees( @@ -61,7 +63,7 @@ ParallelTensorDims lift_to_parallel_with_degrees( return ShardParallelDim{size, degree}; }); - return ParallelTensorDims{FFOrdered{lifted}, + return ParallelTensorDims{ff_ordered_of(lifted), ReplicaParallelDimSet{ sum_degree, discard_copy_degree, diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index b48a23b281..435f211a01 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -1,15 +1,24 @@ #include "op-attrs/tensor_dims.h" +#include "op-attrs/ff_ordered/enumerate.h" +#include "op-attrs/ff_ordered/filtrans.h" +#include "op-attrs/ff_ordered/get_idxs.h" #include "op-attrs/ff_ordered/slice.h" #include "op-attrs/ff_ordered/zip.h" +#include "op-attrs/ff_ordered/zip_with.h" #include "op-attrs/replica_parallel_dim_set.h" #include "op-attrs/shard_parallel_dim.dtg.h" +#include "utils/containers/all_are_true.h" #include "utils/containers/all_of.h" +#include "utils/containers/cartesian_product.h" +#include "utils/containers/contains.h" #include "utils/containers/product.h" #include "utils/containers/reversed.h" #include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include 
"utils/containers/vector_of.h" #include "utils/containers/zip.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -18,7 +27,11 @@ FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } -nonnegative_int num_dims(TensorDims const &dims) { +bool tensor_dims_has_dim(TensorDims const &tensor_dims, ff_dim_t dim) { + return contains(get_idxs(tensor_dims.ff_ordered), dim); +} + +nonnegative_int get_num_dims(TensorDims const &dims) { return num_elements(dims.ff_ordered); } @@ -30,13 +43,39 @@ positive_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } +positive_int dim_at_idx(TensorDims const &dims, ff_dim_t ff_dim_idx) { + return dims.ff_ordered.at(ff_dim_idx); +} + +positive_int &dim_at_idx(TensorDims &dims, ff_dim_t ff_dim_idx) { + return dims.ff_ordered.at(ff_dim_idx); +} + +std::optional try_dim_at_idx(TensorDims const &dims, + relative_ff_dim_t idx) { + if (dims.ff_ordered.idx_is_valid(idx)) { + return dims.ff_ordered.at(idx); + } else { + return std::nullopt; + } +} + +std::optional try_dim_at_idx(TensorDims const &dims, + ff_dim_t idx) { + if (dims.ff_ordered.idx_is_valid(idx)) { + return dims.ff_ordered.at(idx); + } else { + return std::nullopt; + } +} + positive_int get_num_elements(TensorDims const &d) { return product(d.ff_ordered); } bool tensor_dims_is_broadcastable_to(TensorDims const &curr, TensorDims const &goal) { - if (num_dims(curr) > num_dims(goal)) { + if (get_num_dims(curr) > get_num_dims(goal)) { return false; } @@ -53,6 +92,80 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return true; } +bool tensor_dims_contains_coord(TensorDims const &tensor_dims, + TensorDimsCoord const &coord) { + ASSERT(coord.ff_ordered.size() == get_num_dims(tensor_dims)); + + return all_are_true(zip_with( + coord.ff_ordered, + tensor_dims.ff_ordered, + [](nonnegative_int const &coord_entry, positive_int const &dim_size) { + return coord_entry < dim_size; + })); +} + +TensorDimsCoord get_broadcast_src_coord(TensorDims const &input_dims, + TensorDims const &output_dims, + TensorDimsCoord const &dst_coord) { + ASSERT(tensor_dims_contains_coord(output_dims, dst_coord), + output_dims, + dst_coord); + ASSERT(tensor_dims_is_broadcastable_to(input_dims, output_dims), + input_dims, + output_dims); + + relative_ff_dim_t trailing_start_idx = + relative_ff_dim_t{-1 * get_num_dims(input_dims).unwrap_nonnegative()}; + + FFOrdered trailing_entries = + slice(dst_coord.ff_ordered, trailing_start_idx); + + FFOrdered trailing_dims = + slice(output_dims.ff_ordered, trailing_start_idx); + + TensorDimsCoord result = TensorDimsCoord{ + zip_with(trailing_entries, + input_dims.ff_ordered, + [](nonnegative_int const &coord_entry, + positive_int const &input_dim_size) { + if (input_dim_size == 1) { + return 0_n; + } else { + return coord_entry; + } + }), + }; + + ASSERT(tensor_dims_contains_coord(input_dims, result), + output_dims, + dst_coord, + input_dims, + result); + + return result; +} + +std::unordered_set + get_tensor_dims_coord_set(TensorDims const &tensor_dims) { + std::vector> per_dim_ranges = transform( + vector_of(tensor_dims.ff_ordered), + [](positive_int dim_size) -> std::vector { + return nonnegative_range(dim_size.nonnegative_int_from_positive_int()); + }); + + std::unordered_set> raw_points = + unordered_set_of(cartesian_product(per_dim_ranges)); + + return transform(raw_points, + [](std::vector const &raw_point) 
{ + return TensorDimsCoord{ff_ordered_of(raw_point)}; + }); +} + +std::unordered_set get_ff_dim_t_set(TensorDims const &tensor_dims) { + return unordered_set_of(get_idxs(tensor_dims.ff_ordered)); +} + std::optional get_broadcast_target_dims(std::unordered_set const &dims) { for (TensorDims target_candidate : dims) { @@ -66,6 +179,19 @@ std::optional return std::nullopt; } +TensorDims tensor_dims_drop_dims( + TensorDims const &dims, + std::function const &should_drop_dim) { + std::vector result; + for (ff_dim_t idx : get_idxs(dims.ff_ordered)) { + if (!should_drop_dim(idx)) { + result.push_back(dims.ff_ordered.at(idx)); + } + } + + return TensorDims{ff_ordered_of(result)}; +} + TensorDims slice_tensor_dims(TensorDims const &dims, relative_ff_dim_t const &start, std::optional const &stop) { @@ -74,4 +200,12 @@ TensorDims slice_tensor_dims(TensorDims const &dims, }; } +TensorDims slice_tensor_dims(TensorDims const &dims, + ff_dim_t const &start, + std::optional const &stop) { + return TensorDims{ + slice(dims.ff_ordered, start, stop), + }; +} + } // namespace FlexFlow diff --git a/lib/kernels/src/kernels/array_coord.cc b/lib/op-attrs/src/op-attrs/tensor_dims_coord.cc similarity index 53% rename from lib/kernels/src/kernels/array_coord.cc rename to lib/op-attrs/src/op-attrs/tensor_dims_coord.cc index 0927cb9951..6cdf5711ed 100644 --- a/lib/kernels/src/kernels/array_coord.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims_coord.cc @@ -1,12 +1,16 @@ -#include "kernels/array_coord.h" +#include "op-attrs/tensor_dims_coord.h" #include "op-attrs/ff_ordered/ff_ordered_of.h" #include "op-attrs/ff_ordered/get_idxs.h" -#include namespace FlexFlow { -ArrayCoord array_coord_drop_dims( - ArrayCoord const &coord, +nonnegative_int + tensor_dims_coord_get_num_dims(TensorDimsCoord const &tensor_dims_coord) { + return nonnegative_int{tensor_dims_coord.ff_ordered.size()}; +} + +TensorDimsCoord tensor_dims_coord_drop_dims( + TensorDimsCoord const &coord, std::function const &should_drop_dim) { std::vector result; for (ff_dim_t idx : get_idxs(coord.ff_ordered)) { @@ -15,7 +19,7 @@ ArrayCoord array_coord_drop_dims( } } - return ArrayCoord{ff_ordered_of(result)}; + return TensorDimsCoord{ff_ordered_of(result)}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 7a1ba810a7..270ebb9e0c 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -8,24 +8,18 @@ namespace FlexFlow { -nonnegative_int num_dims(TensorShape const &s) { - return num_elements(s.dims.ff_ordered); +num_bytes_t get_size_in_bytes(TensorShape const &s) { + return num_bytes_t{(get_num_elements(s.dims) * size_of_datatype(s.data_type)) + .nonnegative_int_from_positive_int()}; } -positive_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { - return dim_at_idx(s.dims, idx); -} - -positive_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { - return dim_at_idx(s.dims, idx); -} - -positive_int get_num_elements(TensorShape const &s) { - return get_num_elements(s.dims); -} - -positive_int get_size_in_bytes(TensorShape const &s) { - return get_num_elements(s) * size_of_datatype(s.data_type); +TensorShape tensor_shape_drop_dims( + TensorShape const &input_shape, + std::function const &should_drop_dim) { + return TensorShape{ + /*dims=*/tensor_dims_drop_dims(input_shape.dims, should_drop_dim), + /*data_type=*/input_shape.data_type, + }; } TensorShape slice_tensor_shape(TensorShape const &shape, diff --git 
a/lib/op-attrs/test/src/op-attrs/datatype_value.cc b/lib/op-attrs/test/src/op-attrs/datatype_value.cc index 9b0e90b601..140cdaae6f 100644 --- a/lib/op-attrs/test/src/op-attrs/datatype_value.cc +++ b/lib/op-attrs/test/src/op-attrs/datatype_value.cc
@@ -1,15 +1,30 @@ #include "op-attrs/datatype_value.h" +#include "test/utils/doctest/fmt/half.h" #include <doctest/doctest.h> using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("test make_data_type_value") { + SUBCASE("make_half_data_type_value") { + half value = 3.0f; + DataTypeValue data_type_value = make_half_data_type_value(value); + + CHECK(data_type_value.has<half>()); + CHECK_FALSE(data_type_value.has<float>()); + CHECK_FALSE(data_type_value.has<double>()); + CHECK_FALSE(data_type_value.has<int32_t>()); + CHECK_FALSE(data_type_value.has<int64_t>()); + CHECK_FALSE(data_type_value.has<bool>()); + CHECK(data_type_value.get<half>() == value); + } + SUBCASE("make_float_data_type_value") { float value = 1.0f; DataTypeValue data_type_value = make_float_data_type_value(value); CHECK(data_type_value.has<float>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -22,6 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_double_data_type_value(value); CHECK(data_type_value.has<double>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<int32_t>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -34,6 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_int32_data_type_value(value); CHECK(data_type_value.has<int32_t>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int64_t>());
@@ -46,6 +63,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_int64_data_type_value(value); CHECK(data_type_value.has<int64_t>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>());
@@ -58,6 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { DataTypeValue data_type_value = make_bool_data_type_value(value); CHECK(data_type_value.has<bool>()); + CHECK_FALSE(data_type_value.has<half>()); CHECK_FALSE(data_type_value.has<float>()); CHECK_FALSE(data_type_value.has<double>()); CHECK_FALSE(data_type_value.has<int32_t>());
@@ -65,4 +84,60 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(data_type_value.get<bool>() == value); } } +
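// Editorial sketch (not part of the patch): the TEST_CASE below checks the tag
// returned for each factory; the same round-trip also characterizes the new
// make_zero_data_type_value_of_type introduced earlier in this patch, e.g.:
//
//   DataTypeValue zero = make_zero_data_type_value_of_type(DataType::INT32);
//   get_data_type_of_data_type_value(zero)  -> DataType::INT32   (holds for every DataType)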
+ TEST_CASE("get_data_type_of_data_type_value") { + SUBCASE("half") { + DataTypeValue input = make_half_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::HALF; + + CHECK(result == correct); + } + + SUBCASE("float") { + DataTypeValue input = make_float_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::FLOAT; + + CHECK(result == correct); + } + + SUBCASE("double") { + DataTypeValue input = make_double_data_type_value(0.0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::DOUBLE; + + CHECK(result == correct); + } + + SUBCASE("int32") { + DataTypeValue input = make_int32_data_type_value(0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::INT32; + + CHECK(result == correct); + } + + SUBCASE("int64") { + DataTypeValue input = make_int64_data_type_value(0); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::INT64; + + CHECK(result == correct); + } + + SUBCASE("bool") { + DataTypeValue input = make_bool_data_type_value(false); + + DataType result = get_data_type_of_data_type_value(input); + DataType correct = DataType::BOOL; + + CHECK(result == correct); + } + } }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc index d8e04124bc..1743ebb86e 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/concat.cc
@@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { FFOrdered<int> r_input = FFOrdered<int>{2, 1}; FFOrdered<int> result = concat(l_input, r_input); - FFOrdered<int> correct = {1, 3, 1, 2, 1}; + FFOrdered<int> correct = FFOrdered<int>{1, 3, 1, 2, 1}; CHECK(result == correct); }
@@ -29,13 +29,13 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("concat(std::vector<FFOrdered<T>>)") { SUBCASE("inputs have elements") { std::vector<FFOrdered<int>> input = { - {1}, - {2, 1}, - {1}, + FFOrdered<int>{1}, + FFOrdered<int>{2, 1}, + FFOrdered<int>{1}, }; FFOrdered<int> result = concat(input); - FFOrdered<int> correct = { + FFOrdered<int> correct = FFOrdered<int>{ 1, 2, 1,
@@ -55,10 +55,14 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("inputs are empty") { - std::vector<FFOrdered<int>> input = {{}, {}, {}}; + std::vector<FFOrdered<int>> input = { + FFOrdered<int>{}, + FFOrdered<int>{}, + FFOrdered<int>{}, + }; FFOrdered<int> result = concat(input); - FFOrdered<int> correct = {}; + FFOrdered<int> correct = FFOrdered<int>{}; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc index e1a94e72c3..c8566b6de4 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/enumerate.cc
@@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(FFOrdered<T>)") { - FFOrdered<std::string> input = {"zero", "one", "two"}; + FFOrdered<std::string> input = FFOrdered<std::string>{"zero", "one", "two"}; std::map<ff_dim_t, std::string> result = enumerate(input); std::map<ff_dim_t, std::string> correct = {
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc index 73036d5662..49bc13cf8e 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/ff_ordered_from_map.cc
@@ -48,7 +48,7 @@ TEST_SUITE(FF_TEST_SUITE) { }; FFOrdered<int> result = ff_ordered_from_map(m); - FFOrdered<int> correct = {4, 5, 2, 7}; + FFOrdered<int> correct = FFOrdered<int>{4, 5, 2, 7}; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc new file mode 100644 index 0000000000..944248d37b --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/reversed.cc
@@ -0,0 +1,26 @@ +#include "op-attrs/ff_ordered/reversed.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("reversed(FFOrdered<T>)") { + SUBCASE("non-empty input") { + FFOrdered<int> input = FFOrdered<int>{1, 2, 3, 2}; + + FFOrdered<int> result = reversed(input); + FFOrdered<int> correct = FFOrdered<int>{2, 3, 2, 1}; + + CHECK(result == correct); + } + + SUBCASE("empty input") { + FFOrdered<int> input = {}; + + FFOrdered<int> result = reversed(input); + FFOrdered<int> correct = {}; + + CHECK(result == correct); + } + } +}
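// Editorial note (not part of the patch): the "empty input" subcase above can
// still write `FFOrdered<int> input = {};` even though the converting
// constructors are now explicit, because empty braces select the
// (non-explicit) default constructor rather than the initializer_list overload.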
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc index 4bf189ec77..2c5c89db29 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/transform.cc
@@ -18,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input is not empty") { - FFOrdered<int> input = {2, 1, 2, 5}; + FFOrdered<int> input = FFOrdered<int>{2, 1, 2, 5}; FFOrdered<std::string> result = transform(input, [](int x) { return fmt::to_string(x); });
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc index 19167cd0ff..4b14bcd134 100644 --- a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip.cc
@@ -6,18 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("zip(FFOrdered<T1>, FFOrdered<T2>)") { - FFOrdered<int> lhs_input = {9, 9, 8, 9}; - FFOrdered<std::string> rhs_input = {"m", "m", "k", "l", "m"}; + FFOrdered<int> lhs_input = FFOrdered<int>{9, 9, 8, 9}; + FFOrdered<std::string> rhs_input = + FFOrdered<std::string>{"m", "m", "k", "l", "m"}; SUBCASE("lhs is longer") { FFOrdered<std::pair<int, std::string>> result = zip(lhs_input, rhs_input); - FFOrdered<std::pair<int, std::string>> correct = { - {9, "m"}, - {9, "m"}, - {8, "k"}, - {9, "l"}, - }; + FFOrdered<std::pair<int, std::string>> correct = + FFOrdered<std::pair<int, std::string>>{ + {9, "m"}, + {9, "m"}, + {8, "k"}, + {9, "l"}, + }; CHECK(result == correct); }
@@ -25,12 +27,13 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("rhs is longer") { FFOrdered<std::pair<std::string, int>> result = zip(rhs_input, lhs_input); - FFOrdered<std::pair<std::string, int>> correct = { - {"m", 9}, - {"m", 9}, - {"k", 8}, - {"l", 9}, - }; + FFOrdered<std::pair<std::string, int>> correct = + FFOrdered<std::pair<std::string, int>>{ + {"m", 9}, + {"m", 9}, + {"k", 8}, + {"l", 9}, + }; CHECK(result == correct); }
diff --git a/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc new file mode 100644 index 0000000000..d61f709629 --- /dev/null +++ b/lib/op-attrs/test/src/op-attrs/ff_ordered/zip_with.cc
@@ -0,0 +1,80 @@ +#include "op-attrs/ff_ordered/zip_with.h" +#include "test/utils/doctest/fmt/pair.h" +#include <doctest/doctest.h> + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("zip_with(FFOrdered<T1>, FFOrdered<T2>, F)") { + SUBCASE("result types and input types are all different") { + FFOrdered<int> v1 = FFOrdered<int>{1, 3, 4, 3}; + FFOrdered<std::string> v2 = + FFOrdered<std::string>{"aa", "cc", "bb", "dd"}; + + FFOrdered<std::pair<int, std::string>> result = + zip_with(v1, v2, [](int x1, std::string const &x2) { + return std::make_pair(x1, x2); + }); + FFOrdered<std::pair<int, std::string>> correct = + FFOrdered<std::pair<int, std::string>>{ + {1, "aa"}, + {3, "cc"}, + {4, "bb"}, + {3, "dd"}, + }; + + CHECK(result == correct); + } + + SUBCASE("input lengths don't match") { + auto add = [](int x1, int x2) { return x1 + x2; }; + + FFOrdered<int> shorter = FFOrdered<int>{1, 2}; + FFOrdered<int> longer = FFOrdered<int>{1, 3, 5, 7}; + + SUBCASE("first input is shorter") { + FFOrdered<int> result = zip_with(shorter, longer, add); + FFOrdered<int> correct = FFOrdered<int>{1 + 1, 2 + 3}; + + CHECK(result == correct); + } + + SUBCASE("second input is shorter") { + FFOrdered<int> result = zip_with(longer, shorter, add); + FFOrdered<int> correct = FFOrdered<int>{1 + 1, 2 + 3}; + + CHECK(result == correct); + } + } + + SUBCASE("properly handles empty inputs") { + FFOrdered<int> nonempty = FFOrdered<int>{1, 2}; + FFOrdered<int> empty = {}; + + auto throw_err = [](int x1, int x2) -> int { + throw std::runtime_error("error"); + }; + + SUBCASE("first input is empty") { + FFOrdered<int> result = zip_with(empty, nonempty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + + SUBCASE("second input is empty") { + FFOrdered<int> result = zip_with(nonempty, empty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + + SUBCASE("both inputs are empty") { + FFOrdered<int> result = zip_with(empty, empty, throw_err); + FFOrdered<int> correct = empty; + + CHECK(result == correct); + } + } + } +}
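// Editorial sketch (not part of the patch): the "input lengths don't match"
// subcases above pin down truncate-to-shorter semantics. FFOrdered's zip_with
// (added earlier in this patch) forwards to the utils/containers version; a
// minimal stand-in over std::vector with the same asserted behavior could look
// like the following (vec_zip_with is a hypothetical name, not FlexFlow API):

#include <algorithm>
#include <vector>

template <typename T1, typename T2, typename F>
auto vec_zip_with(std::vector<T1> const &lhs, std::vector<T2> const &rhs, F &&f)
    -> std::vector<decltype(f(lhs.front(), rhs.front()))> {
  std::vector<decltype(f(lhs.front(), rhs.front()))> result;
  size_t n = std::min(lhs.size(), rhs.size()); // truncate to the shorter input
  for (size_t i = 0; i < n; i++) {
    result.push_back(f(lhs.at(i), rhs.at(i)));
  }
  return result;
}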
4ef34c666e..72d499d20e 100644
--- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc
+++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc
@@ -1,5 +1,6 @@
 #include "op-attrs/ops/element_binary.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "op-attrs/tensor_dims.h"
 #include "test/utils/doctest/fmt/expected.h"
 #include
@@ -41,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("mismatched dim size") {
       TensorShape incorrect_rhs = input_lhs;
-      dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_p;
+      dim_at_idx(incorrect_rhs.dims, relative_ff_dim_t{0}) += 1_p;
 
       tl::expected result =
           get_output_shape(attrs, input_lhs, incorrect_rhs);
diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc
index 61934fd1fe..4e0dd149ab 100644
--- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc
+++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc
@@ -85,8 +85,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     TensorShape projection = TensorShape{
         TensorDims{
             FFOrdered{
-                in_channels,
                 out_channels,
+                in_channels,
             },
         },
         DataType::FLOAT,
@@ -145,10 +145,10 @@ TEST_SUITE(FF_TEST_SUITE) {
     auto make_projection = [&](SumDegree o_sum,
                                DiscardCopyDegree o_eq,
-                               positive_int o_inchannel,
-                               positive_int o_outchannel) {
+                               positive_int o_outchannel,
+                               positive_int o_inchannel) {
       return lift_to_parallel_with_degrees(
-          projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel});
+          projection, o_sum, o_eq, FFOrdered{o_outchannel, o_inchannel});
     };
 
     auto make_bias = [&](SumDegree o_sum,
@@ -232,8 +232,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       tl::expected correct =
           make_projection(SumDegree{1_p},
                           DiscardCopyDegree{input_sum_degree},
-                          degree,
-                          1_p);
+                          1_p,
+                          degree);
 
       CHECK(result == correct);
     }
@@ -274,8 +274,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       tl::expected correct =
           make_projection(SumDegree{1_p},
                           DiscardCopyDegree{input_sum_degree},
-                          1_p,
-                          degree);
+                          degree,
+                          1_p);
 
       CHECK(result == correct);
     }
diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
index 7c559cf5a8..fc501873d9 100644
--- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
+++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc
@@ -1,10 +1,63 @@
 #include "op-attrs/tensor_dims.h"
 #include "test/utils/doctest/fmt/optional.h"
+#include "test/utils/doctest/fmt/unordered_set.h"
 #include
 
 using namespace ::FlexFlow;
 
 TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("tensor_dims_has_dim") {
+    SUBCASE("nonempty tensor_dims") {
+      TensorDims tensor_dims = TensorDims{FFOrdered{6_p, 9_p, 8_p}};
+
+      SUBCASE("does have dim") {
+        bool correct = true;
+        SUBCASE("leading dim") {
+          ff_dim_t dim = ff_dim_t{0_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+
+        SUBCASE("internal dim") {
+          ff_dim_t dim = ff_dim_t{1_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+
+        SUBCASE("trailing dim") {
+          ff_dim_t dim = ff_dim_t{2_n};
+
+          bool result = tensor_dims_has_dim(tensor_dims, dim);
+
+          CHECK(result == correct);
+        }
+      }
+
+      SUBCASE("dim is too large") {
+        ff_dim_t dim = ff_dim_t{3_n};
+
+        bool result = tensor_dims_has_dim(tensor_dims, dim);
+        bool correct = false;
+
+        CHECK(result == correct);
+      }
+    }
+
+    SUBCASE("empty tensor_dims") {
+      TensorDims tensor_dims = TensorDims{FFOrdered{}};
+      ff_dim_t dim = ff_dim_t{0_n};
+
+      bool result = tensor_dims_has_dim(tensor_dims, dim);
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+
   TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") {
     TensorDims
goal = TensorDims{FFOrdered{1_p, 1_p, 4_p, 3_p}}; @@ -62,6 +115,39 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE("get_tensor_dims_coord_set") { + SUBCASE("TensorDims is not empty") { + TensorDims input = TensorDims{ + FFOrdered{3_p, 1_p, 2_p}, + }; + + std::unordered_set result = + get_tensor_dims_coord_set(input); + std::unordered_set correct = { + TensorDimsCoord{FFOrdered{0_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{0_n, 0_n, 1_n}}, + TensorDimsCoord{FFOrdered{1_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{1_n, 0_n, 1_n}}, + TensorDimsCoord{FFOrdered{2_n, 0_n, 0_n}}, + TensorDimsCoord{FFOrdered{2_n, 0_n, 1_n}}, + }; + + CHECK(result == correct); + } + + SUBCASE("TensorDims is zero-dimensional") { + TensorDims input = TensorDims{FFOrdered{}}; + + std::unordered_set result = + get_tensor_dims_coord_set(input); + std::unordered_set correct = { + TensorDimsCoord{FFOrdered{}}, + }; + + CHECK(result == correct); + } + } + TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { TensorDims d1 = TensorDims{FFOrdered{1_p, 10_p, 4_p, 3_p}}; @@ -119,4 +205,47 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } } + + TEST_CASE("tensor_dims_drop_dims") { + TensorDims dims = TensorDims{ + FFOrdered{3_p, 5_p, 1_p, 2_p}, + }; + + SUBCASE("removes dims specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return d.value % 2_n == 0_n; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = TensorDims{ + FFOrdered{5_p, 2_p}, + }; + + CHECK(result == correct); + } + + SUBCASE( + "is identity function if no dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return false; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = dims; + + CHECK(result == correct); + } + + SUBCASE( + "returns empty dims if all dimensions are specified to be dropped") { + std::function should_drop_dim = [](ff_dim_t d) { + return true; + }; + + TensorDims result = tensor_dims_drop_dims(dims, should_drop_dim); + TensorDims correct = TensorDims{FFOrdered{}}; + + CHECK(result == correct); + } + } } diff --git a/lib/kernels/test/src/kernels/array_coord.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc similarity index 59% rename from lib/kernels/test/src/kernels/array_coord.cc rename to lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc index bbb503caf1..bb24bfd059 100644 --- a/lib/kernels/test/src/kernels/array_coord.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims_coord.cc @@ -1,11 +1,11 @@ -#include "kernels/array_coord.h" +#include "op-attrs/tensor_dims_coord.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("array_coord_drop_dims") { - ArrayCoord coord = ArrayCoord{ + TEST_CASE("tensor_dims_coord_drop_dims") { + TensorDimsCoord coord = TensorDimsCoord{ FFOrdered{3_n, 5_n, 0_n, 1_n}, }; @@ -14,8 +14,9 @@ TEST_SUITE(FF_TEST_SUITE) { return d.value % 2_n == 0_n; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = ArrayCoord{ + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = TensorDimsCoord{ FFOrdered{5_n, 1_n}, }; @@ -28,8 +29,9 @@ TEST_SUITE(FF_TEST_SUITE) { return false; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = coord; + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = coord; CHECK(result == correct); } @@ -40,8 
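// Illustrative sketch, not from the patch: tensor_dims_drop_dims, exercised by
// the tests above, removes exactly the dimensions whose ff_dim_t index
// satisfies the predicate. A minimal usage sketch, assuming the op-attrs
// tensor_dims header and the _n literal used throughout these tests:
#include "op-attrs/tensor_dims.h"
#include <functional>

TensorDims drop_even_dims(TensorDims const &dims) {
  std::function<bool(ff_dim_t)> should_drop_dim = [](ff_dim_t d) {
    return d.value % 2_n == 0_n; // drop dims 0, 2, 4, ...
  };
  // For dims {3, 5, 1, 2} this keeps dims 1 and 3, yielding {5, 2}.
  return tensor_dims_drop_dims(dims, should_drop_dim);
}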
+42,9 @@ TEST_SUITE(FF_TEST_SUITE) { return true; }; - ArrayCoord result = array_coord_drop_dims(coord, should_drop_dim); - ArrayCoord correct = ArrayCoord{FFOrdered{}}; + TensorDimsCoord result = + tensor_dims_coord_drop_dims(coord, should_drop_dim); + TensorDimsCoord correct = TensorDimsCoord{FFOrdered{}}; CHECK(result == correct); } diff --git a/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml b/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml new file mode 100644 index 0000000000..f4714a87c8 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_plus_signature.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "CGOperatorPlusSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/computation_graph_op_attrs.dtg.h", + "pcg/cg_operator_tensor_shape_signature.dtg.h", + "", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::ComputationGraphOpAttrs" + +[[fields]] +name = "tensor_shape_signature" +type = "::FlexFlow::CGOperatorTensorShapeSignature" diff --git a/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h new file mode 100644 index 0000000000..3629aaff43 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_PCG_INCLUDE_PCG_CG_OPERATOR_TENSOR_SHAPE_SIGNATURE_H +#define _FLEXFLOW_LIB_PCG_INCLUDE_PCG_CG_OPERATOR_TENSOR_SHAPE_SIGNATURE_H + +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" +#include "pcg/tensor_role.dtg.h" + +namespace FlexFlow { + +std::vector + tensor_shapes_for_role(CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role); + +TensorShape tensor_shape_for_role_and_index( + CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role, + nonnegative_int index); + +} // namespace FlexFlow + +#endif diff --git a/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml new file mode 100644 index 0000000000..a2a6c047c6 --- /dev/null +++ b/lib/pcg/include/pcg/cg_operator_tensor_shape_signature.struct.toml @@ -0,0 +1,32 @@ +namespace = "FlexFlow" +name = "CGOperatorTensorShapeSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", + "rapidcheck", +] + +includes = [ + "op-attrs/tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::TensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::TensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::TensorShape>" diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index 2be2a54cd8..d90898716f 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -145,7 +145,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); tensor_guid_t layer_norm(tensor_guid_t const &input, - std::vector const &axes, + std::set const &axes, bool elementwise_affine, float eps, std::optional const &name = std::nullopt); diff --git a/lib/pcg/include/pcg/file_format/v1/data_type_value.h b/lib/pcg/include/pcg/file_format/v1/data_type_value.h index ec3910aab3..dae0ccb368 100644 --- a/lib/pcg/include/pcg/file_format/v1/data_type_value.h +++ b/lib/pcg/include/pcg/file_format/v1/data_type_value.h @@ -1,7 +1,7 @@ #ifndef 
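// Illustrative sketch, not from the patch: CGOperatorTensorShapeSignature
// (added above) bundles an operator's input, weight, and output TensorShapes,
// and the two helpers declared in cg_operator_tensor_shape_signature.h select
// shapes by role. A hypothetical lookup of an operator's second weight shape,
// using only the declared API and the _n nonnegative_int literal:
#include "pcg/cg_operator_tensor_shape_signature.h"

TensorShape second_weight_shape(CGOperatorTensorShapeSignature const &sig) {
  // Equivalent to tensor_shapes_for_role(sig, TensorRole::WEIGHT).at(1).
  return tensor_shape_for_role_and_index(sig, TensorRole::WEIGHT, 1_n);
}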
_FLEXFLOW_PCG_INCLUDE_PCG_FILE_FORMAT_V1_DATA_TYPE_H #define _FLEXFLOW_PCG_INCLUDE_PCG_FILE_FORMAT_V1_DATA_TYPE_H -#include "utils/fp16.h" +#include "utils/half.h" #include namespace FlexFlow { diff --git a/lib/pcg/include/pcg/optimizer_attrs.h b/lib/pcg/include/pcg/optimizer_attrs.h index 51dd92c23a..5d9ea8d112 100644 --- a/lib/pcg/include/pcg/optimizer_attrs.h +++ b/lib/pcg/include/pcg/optimizer_attrs.h @@ -2,11 +2,12 @@ #define _FLEXFLOW_PCG_OPTIMIZER_ATTRS_H #include "pcg/optimizer_attrs.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { OptimizerAttrs get_optimizer_attrs_for_next_iter(OptimizerAttrs const &old); -int get_num_optimizer_tensors(OptimizerAttrs const &); +nonnegative_int get_num_optimizer_tensors(OptimizerAttrs const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml b/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml new file mode 100644 index 0000000000..e827dae891 --- /dev/null +++ b/lib/pcg/include/pcg/pcg_operator_plus_signature.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "PCGOperatorPlusSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/pcg_operator_attrs.dtg.h", + "pcg/pcg_operator_tensor_shape_signature.dtg.h", + "", +] + +[[fields]] +name = "op_attrs" +type = "::FlexFlow::PCGOperatorAttrs" + +[[fields]] +name = "tensor_shape_signature" +type = "::FlexFlow::PCGOperatorTensorShapeSignature" diff --git a/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml b/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml new file mode 100644 index 0000000000..3e99bdde64 --- /dev/null +++ b/lib/pcg/include/pcg/pcg_operator_tensor_shape_signature.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "PCGOperatorTensorShapeSignature" +features = [ + "eq", + "ord", + "hash", + "fmt", + "json", +] + +includes = [ + "op-attrs/parallel_tensor_shape.dtg.h", + "", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "input_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "weight_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" + +[[fields]] +name = "output_shapes" +type = "std::vector<::FlexFlow::ParallelTensorShape>" diff --git a/lib/task-spec/include/task-spec/tensor_role.enum.toml b/lib/pcg/include/pcg/tensor_role.enum.toml similarity index 100% rename from lib/task-spec/include/task-spec/tensor_role.enum.toml rename to lib/pcg/include/pcg/tensor_role.enum.toml diff --git a/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc b/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc new file mode 100644 index 0000000000..90ffb85c9b --- /dev/null +++ b/lib/pcg/src/pcg/cg_operator_tensor_shape_signature.cc @@ -0,0 +1,28 @@ +#include "pcg/cg_operator_tensor_shape_signature.h" + +namespace FlexFlow { + +std::vector + tensor_shapes_for_role(CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role) { + switch (tensor_role) { + case TensorRole::INPUT: + return signature.input_shapes; + case TensorRole::WEIGHT: + return signature.weight_shapes; + case TensorRole::OUTPUT: + return signature.output_shapes; + default: + PANIC("Unhandled tensor role", tensor_role); + }; +} + +TensorShape tensor_shape_for_role_and_index( + CGOperatorTensorShapeSignature const &signature, + TensorRole tensor_role, + nonnegative_int index) { + return tensor_shapes_for_role(signature, tensor_role) + 
.at(index.unwrap_nonnegative()); +} + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 0a24acc6aa..4feefa713e 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -41,6 +41,7 @@ #include "utils/containers/without_nullopts.h" #include "utils/containers/zip_with_strict.h" #include "utils/expected.h" +#include "utils/fmt/set.h" #include "utils/stack_vector/stack_vector_of.h" #include @@ -480,8 +481,8 @@ tensor_guid_t ComputationGraphBuilder::gather( DataType::INT64)); } - GatherAttrs attrs = GatherAttrs{ - ff_dim_t_from_relative_ff_dim_t(dim, num_dims(this->get_shape(input)))}; + GatherAttrs attrs = GatherAttrs{ff_dim_t_from_relative_ff_dim_t( + dim, get_num_dims(this->get_shape(input).dims))}; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -702,7 +703,7 @@ tensor_guid_t ComputationGraphBuilder::concat( std::optional const &maybe_name) { ff_dim_t abs_axis = ff_dim_t_from_relative_ff_dim_t( - axis, num_dims(this->get_shape(inputs.at(0)))); + axis, get_num_dims(this->get_shape(inputs.at(0)).dims)); ConcatAttrs attrs = ConcatAttrs{abs_axis}; @@ -719,7 +720,7 @@ tensor_guid_t ComputationGraphBuilder::flat( relative_ff_dim_t start_dim, std::optional const &end_dim, std::optional const &maybe_name) { - nonnegative_int input_num_dims = num_dims(this->get_shape(input)); + nonnegative_int input_num_dims = get_num_dims(this->get_shape(input).dims); ff_dim_t abs_start_dim = ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims); @@ -743,7 +744,7 @@ tensor_guid_t ComputationGraphBuilder::flat( tensor_guid_t ComputationGraphBuilder::layer_norm( tensor_guid_t const &input, - std::vector const &relative_axes, + std::set const &relative_axes, bool elementwise_affine, float eps, std::optional const &maybe_name) { @@ -751,26 +752,26 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( TensorShape input_shape = this->get_shape(input); auto resolve_dim_idx = [&](relative_ff_dim_t dim_idx) { - return ff_dim_t_from_relative_ff_dim_t(dim_idx, num_dims(input_shape)); + return ff_dim_t_from_relative_ff_dim_t(dim_idx, + get_num_dims(input_shape.dims)); }; - stack_vector axes = stack_vector_of( - transform(relative_axes, resolve_dim_idx)); + std::set axes = transform(relative_axes, resolve_dim_idx); if (any_of(axes, [&](ff_dim_t axis) { - return axis.value >= num_dims(input_shape); + return axis.value >= get_num_dims(input_shape.dims); })) { throw mk_runtime_error(fmt::format( "ComputationGraphBuilder::layer_norm received axes {} with " "out-of-bound element (input tensor has num dimensions = {})", axes, - num_dims(input_shape))); + get_num_dims(input_shape.dims))); } LayerNormAttrs attrs = LayerNormAttrs{ - axes, - elementwise_affine, - eps, + /*axes=*/axes, + /*elementwise_affine=*/elementwise_affine, + /*eps=*/eps, }; std::string name = @@ -790,19 +791,16 @@ tensor_guid_t ComputationGraphBuilder::softmax( TensorShape input_shape = this->get_shape(input); - relative_ff_dim_t dim = maybe_dim.value_or( - relative_ff_dim_t{num_dims(input_shape).unwrap_nonnegative() - 1}); + relative_ff_dim_t dim = maybe_dim.value_or(relative_ff_dim_t{ + get_num_dims(input_shape.dims).unwrap_nonnegative() - 1}); - SoftmaxAttrs attrs = - SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(dim, num_dims(input_shape))}; + SoftmaxAttrs attrs = SoftmaxAttrs{ + ff_dim_t_from_relative_ff_dim_t(dim, get_num_dims(input_shape.dims))}; - if (attrs.dim.value >= 
num_dims(input_shape)) { - throw mk_runtime_error( - fmt::format("ComputationGraphBuilder::softmax received out-of-bounds " - "dim {} for input tensor shape {}", - attrs.dim.value, - input_shape)); - } + ASSERT(attrs.dim.value < get_num_dims(input_shape.dims), + "ComputationGraphBuilder::softmax received out_of_bounds dim", + attrs.dim, + input_shape); std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); diff --git a/lib/pcg/src/pcg/optimizer_attrs.cc b/lib/pcg/src/pcg/optimizer_attrs.cc index 7a37091428..b99fcd600b 100644 --- a/lib/pcg/src/pcg/optimizer_attrs.cc +++ b/lib/pcg/src/pcg/optimizer_attrs.cc @@ -23,16 +23,16 @@ OptimizerAttrs } } -int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { - return attrs.visit( +nonnegative_int get_num_optimizer_tensors(OptimizerAttrs const &attrs) { + return attrs.visit( overload{[&](SGDOptimizerAttrs const &o) { if (o.momentum > 0.0f) { - return 1; + return 1_n; } else { - return 0; + return 0_n; } }, - [&](AdamOptimizerAttrs const &) { return 2; }}); + [&](AdamOptimizerAttrs const &) { return 2_n; }}); } } // namespace FlexFlow diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index b08c0a575d..052d30df0f 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -64,13 +64,8 @@ ParallelLayerAddedResult add_parallel_layer( std::vector correct_weight_shapes = get_weight_shapes(layer_attrs.op_attrs, input_shapes); - if (weight_shapes != correct_weight_shapes) { - throw mk_runtime_error( - fmt::format("add_parallel_layer expected weight shapes {}, but " - "received weights with shapes {}", - correct_weight_shapes, - weight_shapes)); - } + ASSERT(weight_shapes == correct_weight_shapes, + "add_parallel_layer received incorrect weight shapes"); std::vector output_shapes = get_output_shapes(layer_attrs.op_attrs, input_shapes); diff --git a/lib/realm-backend/CMakeLists.txt b/lib/realm-backend/CMakeLists.txt index 623816567e..a325e14955 100644 --- a/lib/realm-backend/CMakeLists.txt +++ b/lib/realm-backend/CMakeLists.txt @@ -11,6 +11,7 @@ ff_add_library( op-attrs utils kernels + compiler local-execution pcg spdlog diff --git a/lib/realm-backend/include/realm-backend/model_training_instance.h b/lib/realm-backend/include/realm-backend/model_training_instance.h index b1580b0305..e95b4c81ea 100644 --- a/lib/realm-backend/include/realm-backend/model_training_instance.h +++ b/lib/realm-backend/include/realm-backend/model_training_instance.h @@ -4,29 +4,24 @@ #include "realm-backend/realm_training_backing.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" #include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - struct ModelTrainingInstance { - ModelTrainingInstance(RealmTrainingBacking const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + ModelTrainingInstance(RealmRuntimeState &, + LocalTrainingBacking const &, LossAttrs const &, OptimizerAttrs const &); - RealmTrainingBacking training_backing; - tensor_guid_t logit_tensor; - loss_tensor_t label_tensor; + RealmRuntimeState &runtime_state; + LocalTrainingBacking training_backing; LossAttrs loss_attrs; OptimizerAttrs optimizer_attrs; public: - PerLayerElapsedTime forward(); - 
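// Illustrative sketch, not from the patch: the get_num_optimizer_tensors
// change above encodes how many auxiliary tensors each optimizer keeps per
// weight: 0_n for plain SGD, 1_n (velocity) for SGD with momentum, 2_n (first
// and second moments) for Adam. A hypothetical caller sizing per-weight
// buffers with it (the helper name is assumed):
#include "kernels/accessor.h"
#include "pcg/optimizer_attrs.h"
#include <vector>

std::vector<GenericTensorAccessorW> make_aux_buffer_slots(OptimizerAttrs const &attrs) {
  nonnegative_int n = get_num_optimizer_tensors(attrs);
  std::vector<GenericTensorAccessorW> aux;
  aux.reserve(n.unwrap_nonnegative()); // 0 for SGD, 1 with momentum, 2 for Adam
  return aux;
}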
PerLayerElapsedTime backward(); + std::unordered_map> forward(); + std::unordered_map> backward(); void update(); GenericTensorAccessorR get_loss_tensor_accessor() const; }; diff --git a/lib/realm-backend/include/realm-backend/realm_args_backing.h b/lib/realm-backend/include/realm-backend/realm_args_backing.h deleted file mode 100644 index 75f954c0ad..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_args_backing.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H -#define _FLEXFLOW_REALM_BACKEND_REALM_ARGS_BACKING_H - -#include "pcg/computation_graph.h" -#include "pcg/layer_guid_t.dtg.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/task_result.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/per_device_op_state.h" -#include "task-spec/runtime_arg_config.h" -#include "task-spec/task_invocation.dtg.h" - -namespace FlexFlow { - -struct RealmArgsBacking { - RealmArgsBacking(RuntimeArgConfig const &, - std::unordered_map const &); - -public: - // arguments - RuntimeArgConfig runtime_arg_config; - std::unordered_map - per_device_op_states; -}; - -RealmArgsBacking -make_args_backing_with_empty_device_states(RuntimeArgConfig const &); - -std::optional -get_per_device_op_state_if_exists(RealmArgsBacking const &, - layer_guid_t const &); - -ArgSlotsBacking construct_arg_slots_backing(TaskBinding const &, - RuntimeArgConfig const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h b/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h deleted file mode 100644 index 0e83a3de6f..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_task_argument_accessor.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H -#define _FLEXFLOW_REALM_BACKEND_REALM_TASK_ARGUMENT_ACCESSOR_H - -#include "realm-backend/realm_allocator.h" -#include "task-spec/slot_tensor_type_id.dtg.h" -#include "task-spec/task_argument_accessor.h" -#include -#include - -namespace FlexFlow { - -using TensorSlotsBacking = std::unordered_map< - SlotTensorTypeId, - std::variant>>; -using ArgSlotsBacking = std::unordered_map; - -struct RealmTaskArgumentAccessor : public ITaskArgumentAccessor { - RealmTaskArgumentAccessor(Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing); - - RealmTaskArgumentAccessor(RealmTaskArgumentAccessor const &) = delete; - RealmTaskArgumentAccessor(RealmTaskArgumentAccessor &&) = delete; - - ConcreteArgSpec const &get_concrete_arg(slot_id_t) const override; - - GenericTensorAccessor get_tensor(slot_id_t slot, Permissions priv, - TensorType tensor_type) const override; - VariadicGenericTensorAccessor - get_variadic_tensor(slot_id_t slot, Permissions priv, - TensorType tensor_type) const override; - - Allocator get_allocator() const override; - - size_t get_device_idx() const override; - -private: - Allocator allocator; - TensorSlotsBacking tensor_slots_backing; - ArgSlotsBacking arg_slots_backing; -}; - -CHECK_RC_COPY_VIRTUAL_COMPLIANT(RealmTaskArgumentAccessor); - -} // namespace FlexFlow - -#endif diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h b/lib/realm-backend/include/realm-backend/realm_tensor_backing.h deleted file mode 100644 index b38815ffee..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.h +++ /dev/null @@ -1,47 +0,0 @@ - -#ifndef 
_FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H -#define _FLEXFLOW_REALM_BACKEND_REALM_TENSOR_BACKING_H - -#include "kernels/accessor.h" -#include "local-execution/allocated_tensors.dtg.h" -#include "local-execution/gradient_tensor_source.h" -#include "local-execution/loss_tensor_source.h" -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/unallocated_tensors.dtg.h" -#include "pcg/computation_graph.dtg.h" -#include "pcg/layer_guid_t.dtg.h" -#include "pcg/optimizer_attrs.dtg.h" -#include "realm-backend/realm_allocator.h" -#include "realm-backend/realm_task_argument_accessor.h" -#include "realm-backend/realm_tensor_backing.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/task_invocation.dtg.h" -#include "task-spec/tensor_role.dtg.h" -namespace FlexFlow { - - GenericTensorAccessorW get_tensor(RealmTensorBacking const &, - TensorTypeVariant const &); - - std::unordered_map - get_tensor_backings( - std::unordered_map const &, - std::unordered_map const &, - Allocator &); - - std::unordered_map> - merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated); - - RealmTensorBacking construct_realm_tensor_backing(AllocatedTensors const &, - UnallocatedTensors const &, - Allocator &); - - TensorSlotsBacking construct_tensor_slots_backing(RealmTensorBacking const &, - TaskBinding const &); - - } // namespace FlexFlow - - #endif \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml b/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml deleted file mode 100644 index d53071dd0e..0000000000 --- a/lib/realm-backend/include/realm-backend/realm_tensor_backing.struct.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "RealmTensorBacking" -features = [ - "eq", - "fmt", -] - -includes = [ - "task-spec/tensor_type_t.dtg.h", - "kernels/accessor.h", - "pcg/tensor_guid_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", -] - -src_includes = [ - "utils/fmt/unordered_map.h", - "utils/fmt/vector.h", -] - -[[fields]] -name = "tensor_backings" -type = "std::unordered_map<::FlexFlow::TensorTypeVariant, ::FlexFlow::GenericTensorAccessorW>" - -[[fields]] -name = "tensor_gradient_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, ::FlexFlow::gradient_tensor_t>" - -[[fields]] -name = "tensor_optimizer_mapping" -type = "std::unordered_map<::FlexFlow::tensor_guid_t, std::vector<::FlexFlow::optimizer_tensor_t>>" \ No newline at end of file diff --git a/lib/realm-backend/include/realm-backend/realm_training_backing.h b/lib/realm-backend/include/realm-backend/realm_training_backing.h index 8fe842daf6..57fc7147ce 100644 --- a/lib/realm-backend/include/realm-backend/realm_training_backing.h +++ b/lib/realm-backend/include/realm-backend/realm_training_backing.h @@ -1,79 +1,63 @@ #ifndef _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H #define _FLEXFLOW_REALM_BACKEND_REALM_TRAINING_BACKING_H -#include "local-execution/optimizer_tensor_source.h" -#include "local-execution/task_registry.h" +#include "local-execution/local_training_backing.dtg.h" #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h" -#include "pcg/computation_graph.dtg.h" #include "pcg/optimizer_attrs.dtg.h" -#include "local-execution/allocated_tensors.h" -#include "local-execution/unallocated_tensors.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" +#include 
"utils/containers/generate_map.h" +#include "utils/units/milliseconds_t.h" #include "realm-backend/driver.h" #include "realm-backend/realm_allocator.h" -#include "realm-backend/realm_args_backing.h" -#include "realm-backend/realm_tensor_backing.h" #include "realm-backend/task_wrapper.h" namespace FlexFlow { -using PerLayerElapsedTime = - std::unordered_map>; - -struct RealmTrainingBacking { - RealmTrainingBacking(Realm::Processor, - std::vector const &, - std::vector const &, - AllocatedTensors const &, - GradientTensorSource &, - ComputationGraph const &, RuntimeArgConfig const &); - - RealmTrainingBacking(Realm::Processor, - std::vector const &, - std::vector const &, - AllocatedTensors const &, - GradientTensorSource &, - OptimizerTensorSource &, - ComputationGraph const &, RuntimeArgConfig const &, - OptimizerAttrs const &); - -public: - // runtime +struct RealmRuntimeState { Realm::Processor master_proc; Realm::Event master_event; Realm::Memory master_mem; std::vector worker_procs; std::vector worker_events; std::vector allocators; +}; - ComputationGraph computation_graph; - TaskRegistry task_registry; +LocalTrainingBacking make_realm_training_backing_for_computation_graph( + RealmRuntimeState &runtime_state, + std::unordered_map const + &preallocated_tensors, + TrainingComputationGraph const &training_computation_graph, + RuntimeArgConfig const &runtime_arg_config, + OptimizerAttrs const &optimizer_attrs); - RealmTensorBacking realm_tensor_backing; - RealmArgsBacking realm_args_backing; -}; +void register_tasks_for_realm(LocalTaskRegistry const &, RealmRuntimeState &); + +std::optional + create_per_device_op_state(LocalTaskRegistry const &, + LocalTensorBacking const &, + RuntimeArgConfig const &, + RealmRuntimeState &, + TrainingLayerPlusContext const &); -TaskRegistry construct_task_registry_and_register_tasks_for_realm( - ComputationGraph const &, std::vector const &); +Future> execute_forward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + RealmRuntimeState &); -RealmArgsBacking initialize_args_backing(RealmTrainingBacking *, - ComputationGraph const &, - RuntimeArgConfig const &); +Future> execute_backward(LocalTaskRegistry const &, + LocalTensorBacking const &, + LocalArgsBacking const &, + TrainingLayerPlusContext const &, + RealmRuntimeState &); -void execute_init(RealmTrainingBacking &, layer_guid_t const &); -Future execute_forward(RealmTrainingBacking &, - layer_guid_t const &); -Future execute_backward(RealmTrainingBacking &, - layer_guid_t const &); -Future compute_loss(RealmTrainingBacking &, LossAttrs const &, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor); -Future execute_update(RealmTrainingBacking &, layer_guid_t const &, - OptimizerAttrs const &); +Future compute_loss(LocalTrainingBacking const &, LossAttrs const &, RealmRuntimeState &); -TaskArgumentAccessor get_task_arg_accessor(RealmTensorBacking const &, - RealmArgsBacking const &, - TaskInvocation const &, - Allocator &); +Future execute_update(LocalTrainingBacking const &, + layer_guid_t const &, + OptimizerAttrs const &, + RealmRuntimeState &); } // namespace FlexFlow diff --git a/lib/realm-backend/include/realm-backend/task_result.h b/lib/realm-backend/include/realm-backend/task_result.h index d869982563..46e5f89274 100644 --- a/lib/realm-backend/include/realm-backend/task_result.h +++ b/lib/realm-backend/include/realm-backend/task_result.h @@ -2,7 +2,6 @@ #define _FLEXFLOW_LOCAL_EXECUTION_TASK_RESULT_H 
#include "realm-backend/driver.h" -#include "realm-backend/realm_task_argument_accessor.h" #include #include diff --git a/lib/realm-backend/include/realm-backend/task_wrapper.h b/lib/realm-backend/include/realm-backend/task_wrapper.h index 64a360e549..fa6c9f0ed3 100644 --- a/lib/realm-backend/include/realm-backend/task_wrapper.h +++ b/lib/realm-backend/include/realm-backend/task_wrapper.h @@ -1,8 +1,7 @@ #ifndef _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H #define _FLEXFLOW_REALM_BACKEND_TASK_WRAPPER_H -#include "local-execution/task_registry.h" -#include "realm-backend/realm_task_argument_accessor.h" +#include "local-execution/local_task_registry.h" #include "realm-backend/task_result.h" namespace FlexFlow { diff --git a/lib/realm-backend/src/model_training_instance.cc b/lib/realm-backend/src/model_training_instance.cc index 87b8121bd5..a7d359b638 100644 --- a/lib/realm-backend/src/model_training_instance.cc +++ b/lib/realm-backend/src/model_training_instance.cc @@ -1,98 +1,114 @@ #include "pcg/computation_graph.h" #include "pcg/optimizer_attrs.h" #include "realm-backend/model_training_instance.h" -#include "kernels/format_accessor_contents.h" +#include "task-spec/training_computation_graph.h" #include "utils/containers/reversed.h" namespace FlexFlow { ModelTrainingInstance::ModelTrainingInstance( - RealmTrainingBacking const &realm_training_backing, - tensor_guid_t const &logit_tensor, - loss_tensor_t const &label_tensor, + RealmRuntimeState &runtime_state, + LocalTrainingBacking const &local_training_backing, LossAttrs const &loss_attrs, OptimizerAttrs const &optimizer_attrs) - : training_backing(realm_training_backing), loss_attrs(loss_attrs), - optimizer_attrs(optimizer_attrs), logit_tensor(logit_tensor), - label_tensor(label_tensor){}; + : runtime_state(runtime_state), training_backing(local_training_backing), + loss_attrs(loss_attrs), optimizer_attrs(optimizer_attrs) {} -PerLayerElapsedTime ModelTrainingInstance::forward() { - PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map> +std::unordered_map> + ModelTrainingInstance::forward() { + + std::unordered_map> + per_layer_elapsed_time; + std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { + + for (layer_guid_t const &layer_guid : + topological_ordering(this->training_backing.training_computation_graph + .computation_graph)) { per_layer_elapsed_time_future.insert( - {node, execute_forward(this->training_backing, node)}); + {layer_guid, + execute_forward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->runtime_state) + }); } - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { - float elapsed_time = - per_layer_elapsed_time_future[node].get(); - per_layer_elapsed_time.insert({node, elapsed_time}); + + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { + std::optional elapsed_time = + per_layer_elapsed_time_future[layer_guid].get(); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); } return per_layer_elapsed_time; } -PerLayerElapsedTime ModelTrainingInstance::backward() { - compute_loss(this->training_backing, - this->loss_attrs, - this->logit_tensor, - this->label_tensor); - - 
gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); - GenericTensorAccessorW loss_tensor_backing = - this->training_backing.realm_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); +std::unordered_map> + ModelTrainingInstance::backward() { + compute_loss(this->training_backing, this->loss_attrs, this->runtime_state); - PerLayerElapsedTime per_layer_elapsed_time; - std::unordered_map> + std::unordered_map> + per_layer_elapsed_time; + std::unordered_map>> per_layer_elapsed_time_future; - for (layer_guid_t const &node : reversed(topological_ordering( - this->training_backing.computation_graph))) { + + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { per_layer_elapsed_time_future.insert( - {node, execute_backward(this->training_backing, node)}); + {layer_guid, + execute_backward( + this->training_backing.local_task_registry, + this->training_backing.local_tensor_backing, + this->training_backing.local_args_backing, + get_training_layer_plus_context( + this->training_backing.training_computation_graph, layer_guid), + this->runtime_state) + }); } - for (layer_guid_t const &node : reversed(topological_ordering( - this->training_backing.computation_graph))) { - float elapsed_time = - per_layer_elapsed_time_future[node].get(); - per_layer_elapsed_time.insert({node, elapsed_time}); + + for (layer_guid_t const &layer_guid : reversed(topological_ordering( + this->training_backing.training_computation_graph + .computation_graph))) { + std::optional elapsed_time = + per_layer_elapsed_time_future[layer_guid].get(); + per_layer_elapsed_time.insert({layer_guid, elapsed_time}); } return per_layer_elapsed_time; } void ModelTrainingInstance::update() { std::unordered_map> per_layer_update_future; - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { per_layer_update_future.insert( - {node, execute_update(this->training_backing, - node, - this->optimizer_attrs)}); + {layer_guid, execute_update(this->training_backing, + layer_guid, + this->optimizer_attrs, + this->runtime_state)}); } - for (layer_guid_t const &node : topological_ordering( - this->training_backing.computation_graph)) { - per_layer_update_future[node].wait(); + for (layer_guid_t const &layer_guid : topological_ordering( + this->training_backing.training_computation_graph + .computation_graph)) { + per_layer_update_future[layer_guid].wait(); } this->optimizer_attrs = get_optimizer_attrs_for_next_iter( this->optimizer_attrs); } GenericTensorAccessorR ModelTrainingInstance::get_loss_tensor_accessor() const { - GenericTensorAccessorW logit_tensor_backing = this->training_backing - .realm_tensor_backing.tensor_backings.at(TensorTypeVariant{this->logit_tensor}); - - - gradient_tensor_t loss_tensor = - this->training_backing.realm_tensor_backing.tensor_gradient_mapping.at( - this->logit_tensor); + gradient_tensor_guid_t loss_tensor = get_gradient_tensor_guid_for_tensor_guid( + this->training_backing.training_computation_graph, + this->training_backing.training_computation_graph.logit_tensor); GenericTensorAccessorW loss_tensor_backing = - this->training_backing.realm_tensor_backing.tensor_backings.at( - TensorTypeVariant{loss_tensor}); - + 
this->training_backing.local_tensor_backing + .backing_for_training_tensor_map.at( + training_tensor_guid_t{loss_tensor}); return read_only_accessor_from_write_accessor(loss_tensor_backing); } diff --git a/lib/realm-backend/src/realm_args_backing.cc b/lib/realm-backend/src/realm_args_backing.cc deleted file mode 100644 index d30793a801..0000000000 --- a/lib/realm-backend/src/realm_args_backing.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include "op-attrs/parallel_tensor_shape.h" -#include "realm-backend/realm_args_backing.h" -#include "task-spec/op_task_to_task_invocation.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/map_values.h" -#include "utils/overload.h" - -namespace FlexFlow { - -RealmArgsBacking make_args_backing_with_empty_device_states( - RuntimeArgConfig const &runtime_arg_config) { -return RealmArgsBacking{runtime_arg_config, {}}; -} - -RealmArgsBacking::RealmArgsBacking( - RuntimeArgConfig const &runtime_arg_config, - std::unordered_map const - &device_states) - : runtime_arg_config(runtime_arg_config), - per_device_op_states(device_states){}; - -std::optional get_per_device_op_state_if_exists( - RealmArgsBacking const &realm_args_backing, - layer_guid_t const &layer_guid) { - if (contains_key(realm_args_backing.per_device_op_states, layer_guid)) { - return realm_args_backing.per_device_op_states.at(layer_guid); - } else { - return std::nullopt; - } -} - -ArgSlotsBacking - construct_arg_slots_backing(TaskBinding const &binding, - RuntimeArgConfig const &runtime_arg_config) { - return map_values( - binding.get_arg_bindings(), [&](TaskArgSpec const &arg_binding) { - return arg_binding.template visit( - overload{[&](RuntimeArgRefSpec const &s) { - return lower_to_concrete_arg_spec(s, runtime_arg_config); - }, - [](ConcreteArgSpec const &s) { return s; }}); - }); - ; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_task_argument_accessor.cc b/lib/realm-backend/src/realm_task_argument_accessor.cc deleted file mode 100644 index b7f10772e0..0000000000 --- a/lib/realm-backend/src/realm_task_argument_accessor.cc +++ /dev/null @@ -1,65 +0,0 @@ -#include "realm-backend/realm_task_argument_accessor.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/transform.h" -#include "utils/hash/pair.h" -#include "utils/overload.h" - -namespace FlexFlow { - -RealmTaskArgumentAccessor::RealmTaskArgumentAccessor( - Allocator const &allocator, - TensorSlotsBacking const &tensor_slots_backing, - ArgSlotsBacking const &arg_slots_backing) - : allocator(allocator), tensor_slots_backing(tensor_slots_backing), - arg_slots_backing(arg_slots_backing){}; - -ConcreteArgSpec const & - RealmTaskArgumentAccessor::get_concrete_arg(slot_id_t name) const { - return this->arg_slots_backing.at(name); -} - -GenericTensorAccessor RealmTaskArgumentAccessor::get_tensor( - slot_id_t slot, Permissions priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto tensor_backing = std::get( - this->tensor_slots_backing.at(slot_tensor_type)); - if (priv == Permissions::RO) { - GenericTensorAccessorR readonly_tensor_backing = - read_only_accessor_from_write_accessor(tensor_backing); - return readonly_tensor_backing; - } else if (priv == Permissions::RW || priv == Permissions::WO) { - return tensor_backing; - } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); - } -} - -VariadicGenericTensorAccessor RealmTaskArgumentAccessor::get_variadic_tensor( - slot_id_t slot, Permissions 
priv, TensorType tensor_type) const { - SlotTensorTypeId slot_tensor_type = SlotTensorTypeId{slot, tensor_type}; - auto variadic_tensor_backing = std::get>( - this->tensor_slots_backing.at(slot_tensor_type)); - if (priv == Permissions::RO) { - std::vector readonly_variadic_tensor_backing = {}; - for (GenericTensorAccessorW const &tensor_backing : - variadic_tensor_backing) { - readonly_variadic_tensor_backing.push_back( - read_only_accessor_from_write_accessor(tensor_backing)); - } - return readonly_variadic_tensor_backing; - } else if (priv == Permissions::RW || priv == Permissions::WO) { - return variadic_tensor_backing; - } else { - throw mk_runtime_error(fmt::format("Unhandled privilege mode {}", priv)); - } -} - -Allocator RealmTaskArgumentAccessor::get_allocator() const { - return this->allocator; -} - -size_t RealmTaskArgumentAccessor::get_device_idx() const { - return 0; -} - -} // namespace FlexFlow diff --git a/lib/realm-backend/src/realm_tensor_backing.cc b/lib/realm-backend/src/realm_tensor_backing.cc deleted file mode 100644 index 5dcfa8cef8..0000000000 --- a/lib/realm-backend/src/realm_tensor_backing.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "op-attrs/parallel_tensor_shape.h" -#include "pcg/computation_graph.h" -#include "pcg/optimizer_attrs.h" -#include "realm-backend/realm_tensor_backing.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "utils/containers/contains_key.h" -#include "utils/containers/keys.h" -#include "utils/overload.h" - -namespace FlexFlow { - -GenericTensorAccessorW -get_tensor(RealmTensorBacking const &realm_tensor_backing, - TensorTypeVariant const &tensor_type) { - return realm_tensor_backing.tensor_backings.at(tensor_type); -} - -std::unordered_map> -merge_optimizer_mappings( - std::unordered_map> const - &allocated, - std::unordered_map> const - &unallocated) { - std::unordered_map> - merged_maps = allocated; - for (std::pair> const - &unallocated_optimizer_tensors : unallocated) { - if (merged_maps.count(unallocated_optimizer_tensors.first)) { - for (optimizer_tensor_t const &optimizer_tensor : - unallocated_optimizer_tensors.second) { - merged_maps[unallocated_optimizer_tensors.first].push_back( - optimizer_tensor); - } - } else { - merged_maps.insert({unallocated_optimizer_tensors}); - } - } - return merged_maps; -} - -std::unordered_map -get_tensor_backings( - std::unordered_map const - &tensor_type_backings, - std::unordered_map const - &tensor_type_shapes, - Allocator &allocator) { - std::unordered_map - all_tensor_backings = tensor_type_backings; - - // allocate new tensors - for (std::pair const &tensor_type_shape : - tensor_type_shapes) { - GenericTensorAccessorW tensor_backing = - allocator.allocate_tensor(tensor_type_shape.second); - all_tensor_backings.insert({tensor_type_shape.first, tensor_backing}); - } - - return all_tensor_backings; -} - -RealmTensorBacking -construct_realm_tensor_backing(AllocatedTensors const &allocated_tensors, - UnallocatedTensors const &unallocated_tensors, - Allocator &allocator) { - - std::unordered_map merged_gradient_maps = - allocated_tensors.gradient_mapping; - merged_gradient_maps.insert(unallocated_tensors.gradient_mapping.begin(), - unallocated_tensors.gradient_mapping.end()); - - return RealmTensorBacking{ - get_tensor_backings(allocated_tensors.tensor_type_backings, - unallocated_tensors.tensor_type_shapes, allocator), - merged_gradient_maps, - merge_optimizer_mappings(allocated_tensors.optimizer_mapping, - unallocated_tensors.optimizer_mapping)}; -} - -TensorSlotsBacking 
-construct_tensor_slots_backing(RealmTensorBacking const &realm_tensor_backing,
-                               TaskBinding const &binding) {
-  TensorSlotsBacking mapping;
-
-  for (std::pair const &tensor_binding :
-       binding.get_tensor_bindings()) {
-    mapping.insert({tensor_binding.first,
-                    get_tensor(realm_tensor_backing, tensor_binding.second)});
-  }
-
-  return mapping;
-}
-
-} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index 053bf62838..b436443cdb 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -1,20 +1,19 @@
-#include "kernels/allocation.h"
-#include "local-execution/loss_functions.h"
-#include "local-execution/optimizer.h"
-#include "pcg/computation_graph.dtg.h"
+#include "local-execution/local_args_backing.h"
 #include "pcg/computation_graph.h"
 #include "pcg/optimizer_attrs.h"
-#include "realm-backend/realm_tensor_backing.h"
+#include "task-spec/loss_functions.h"
 #include "task-spec/op_task_to_task_invocation.h"
-#include "task-spec/runtime_arg_config.h"
+#include "task-spec/optimizer.h"
 #include "task-spec/task_invocation.h"
 #include "task-spec/task_signature_impl.h"
+#include "task-spec/training_computation_graph.h"
 #include "utils/containers/contains.h"
 #include "utils/containers/contains_key.h"
 #include "utils/containers/get_only.h"
+#include "utils/containers/is_subseteq_of.h"
+#include "utils/containers/keys.h"
 #include "utils/containers/values.h"
 #include "utils/exception.h"
-
 #include "realm-backend/realm_training_backing.h"
 #include "realm-backend/task_result.h"
 #include "realm-backend/task_wrapper.h"
@@ -23,327 +22,292 @@
 namespace FlexFlow {
 
 using namespace Realm;
 
-RealmTrainingBacking::RealmTrainingBacking(
-    Processor master_proc, std::vector const &worker_procs,
-    std::vector const &allocators,
-    AllocatedTensors const &allocated_tensors,
-    GradientTensorSource &gradient_tensor_source,
-    ComputationGraph const &computation_graph,
-    RuntimeArgConfig const &runtime_arg_config)
-    : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-      master_mem(Machine::MemoryQuery(Machine::get_machine())
-                     .only_kind(Memory::SYSTEM_MEM)
-                     .best_affinity_to(master_proc)
-                     .first()),
-      worker_procs(worker_procs),
-      worker_events(std::vector(worker_procs.size(),
-                                Realm::Event::NO_EVENT)),
-      allocators(allocators), computation_graph(computation_graph),
-      task_registry(construct_task_registry_and_register_tasks_for_realm(
-          computation_graph, worker_procs)),
-      realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-          allocated_tensors,
-          generate_unallocated_tensors(
-              allocated_tensors, get_all_tensor_attrs(computation_graph),
-              gradient_tensor_source),
-          this->allocators[0])),
-      realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-RealmTrainingBacking::RealmTrainingBacking(
-    Processor master_proc, std::vector const &worker_procs,
-    std::vector const &allocators,
-    AllocatedTensors const &allocated_tensors,
-    GradientTensorSource &gradient_tensor_source,
-    OptimizerTensorSource &optimizer_tensor_source,
-    ComputationGraph const &computation_graph,
+LocalTrainingBacking make_realm_training_backing_for_computation_graph(
+    RealmRuntimeState &runtime_state,
+    std::unordered_map<training_tensor_guid_t, GenericTensorAccessorW> const
+        &preallocated,
+    TrainingComputationGraph const &training_computation_graph,
     RuntimeArgConfig const &runtime_arg_config,
-    OptimizerAttrs const &optimizer_attrs)
-    : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-      master_mem(Machine::MemoryQuery(Machine::get_machine())
-                     .only_kind(Memory::SYSTEM_MEM)
-                     .best_affinity_to(master_proc)
-                     .first()),
-      worker_procs(worker_procs),
-      worker_events(std::vector(worker_procs.size(),
-                                Realm::Event::NO_EVENT)),
-      allocators(allocators), computation_graph(computation_graph),
-      task_registry(construct_task_registry_and_register_tasks_for_realm(
-          computation_graph, worker_procs)),
-      realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-          allocated_tensors,
-          generate_unallocated_tensors_with_optimizer(
-              allocated_tensors, get_all_tensor_attrs(computation_graph),
-              gradient_tensor_source, optimizer_tensor_source,
-              optimizer_attrs),
-          this->allocators[0])),
-      realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-TaskRegistry construct_task_registry_and_register_tasks_for_realm(
-    ComputationGraph const &cg, std::vector const &worker_procs) {
-  TaskRegistry task_registry = construct_task_registry(
-      get_layer_attrs_mapping(cg));
-
-  // register tasks for realm
-  std::unordered_map const &layer_attrs_mapping =
-      get_layer_attrs_mapping(cg);
-  for (std::pair const &layer_attrs :
-       layer_attrs_mapping) {
-    ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs;
-    std::vector task_ids = get_task_ids(attrs);
-    for (task_id_t task_id : task_ids) {
-      TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-      // TODO: multi gpu
-      register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl);
-    }
-  }
+    OptimizerAttrs const &optimizer_attrs) {
+
+  ASSERT(is_subseteq_of(
+      keys(preallocated),
+      keys(get_all_training_tensor_shapes(training_computation_graph))));
+
+  LocalTaskRegistry local_task_registry =
+      construct_local_task_registry_for_layers(get_layer_attrs_mapping(
training_computation_graph.computation_graph)); + + register_tasks_for_realm(local_task_registry, runtime_state); + + LocalTensorBacking local_tensor_backing = construct_local_tensor_backing( + get_all_training_tensor_shapes(training_computation_graph), + preallocated, + runtime_state.allocators[0]); + + std::unordered_map> + per_device_op_states = generate_map( + topological_ordering(training_computation_graph.computation_graph), + [&](layer_guid_t const &layer_guid) { + return create_per_device_op_state( + local_task_registry, + local_tensor_backing, + runtime_arg_config, + runtime_state, + get_training_layer_plus_context(training_computation_graph, + layer_guid)); + }); - return task_registry; + LocalArgsBacking local_args_backing = + make_local_args_backing_for_computation_graph(runtime_arg_config, + per_device_op_states); + + return LocalTrainingBacking{ + /*computation_graph=*/training_computation_graph, + /*local_task_registry=*/local_task_registry, + /*local_tensor_backing=*/local_tensor_backing, + /*local_args_backing=*/local_args_backing, + }; } -RealmArgsBacking -initialize_args_backing(RealmTrainingBacking *backing, - ComputationGraph const &cg, - RuntimeArgConfig const &runtime_arg_config) { - std::unordered_map - per_device_op_states; - TaskRegistry const &task_registry = backing->task_registry; - RealmTensorBacking const &realm_tensor_backing = - backing->realm_tensor_backing; - Processor master_proc = backing->master_proc; - Memory master_mem = backing->master_mem; - std::vector &worker_procs = backing->worker_procs; - std::vector &worker_events = backing->worker_events; - // TODO: multi gpu - Allocator &allocator = backing->allocators[0]; - - for (layer_guid_t const &node : topological_ordering(cg)) { - if (registry_contains_task_for_layer(task_registry, node, - OpTaskType::INIT)) { - ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs; - - TaskInvocation invocation = lower_to_task_invocation( - init(attrs), node, get_incoming_inputs(cg, node), - get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node), - get_incoming_weights(cg, node), - realm_tensor_backing.tensor_gradient_mapping, std::nullopt); - TaskArgumentAccessor accessor = get_task_arg_accessor( - realm_tensor_backing, - make_args_backing_with_empty_device_states(runtime_arg_config), - invocation, - allocator); - task_id_t task_id = invocation.task_id; - TaskImplFunction impl_function = - task_registry.task_mapping.at(task_id).impl_function; - // TODO: multi gpu launching - Promise promise = Promise(); - Future future = promise.get_future(); - RealmTaskArgs* task_arg = new RealmTaskArgs{ - task_id, impl_function, accessor, std::move(promise)}; - uintptr_t args[1] = {reinterpret_cast(task_arg)}; - Event e = - worker_procs[0].spawn(get_realm_task_id(task_id), - args, sizeof(uintptr_t), worker_events[0]); - worker_events[0] = e; - future.set_event(e); - per_device_op_states.insert({node, future.get().value()}); +// register tasks for realm runtime +void register_tasks_for_realm(LocalTaskRegistry const &local_task_registry, RealmRuntimeState &runtime_state) { + for (std::pair const &task : local_task_registry.task_mapping) { + task_id_t task_id = task.first; + TaskSignatureAndImpl task_signature_impl = task.second; + // TODO: multi gpu + register_wrapper_tasks(0, runtime_state.worker_procs[0], task_id, task_signature_impl); } +} + +std::optional + create_per_device_op_state(LocalTaskRegistry const &local_task_registry, + LocalTensorBacking const &tensor_backing, + RuntimeArgConfig const 
+
+std::optional<DeviceSpecificDeviceStates>
+    create_per_device_op_state(LocalTaskRegistry const &local_task_registry,
+                               LocalTensorBacking const &tensor_backing,
+                               RuntimeArgConfig const &runtime_arg_config,
+                               RealmRuntimeState &runtime_state,
+                               TrainingLayerPlusContext const &training_layer) {
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::INIT);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return std::nullopt;
   }
-  return RealmArgsBacking{runtime_arg_config, per_device_op_states};
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_init_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/std::nullopt);
+
+  TaskArgumentAccessor accessor = get_task_arg_accessor(
+      tensor_backing, runtime_arg_config, invocation, runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<DeviceSpecificDeviceStates> promise = Promise<DeviceSpecificDeviceStates>();
+  Future<DeviceSpecificDeviceStates> future = promise.get_future();
+  RealmTaskArgs<DeviceSpecificDeviceStates>* task_arg =
+      new RealmTaskArgs<DeviceSpecificDeviceStates>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future.get().value();
 }

-Future<float>
-execute_forward(RealmTrainingBacking &realm_training_backing,
-                layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
-                                       operator_node, OpTaskType::FWD)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
-            .op_attrs;
-    std::optional<DeviceSpecificDeviceStates> device_state =
-        get_per_device_op_state_if_exists(
-            realm_training_backing.realm_args_backing, operator_node);
-    TaskInvocation invocation = lower_to_task_invocation(
-        forward(attrs), operator_node,
-        get_incoming_inputs(realm_training_backing.computation_graph,
-                            operator_node),
-        get_incoming_input_shapes(realm_training_backing.computation_graph,
-                                  operator_node),
-        get_outgoing_tensors(realm_training_backing.computation_graph,
-                             operator_node),
-        get_incoming_weights(realm_training_backing.computation_graph,
-                             operator_node),
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
-        device_state);
-    TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
-    task_id_t task_id = invocation.task_id;
-    TaskImplFunction impl_function =
-        realm_training_backing.task_registry.task_mapping.at(task_id)
-            .impl_function;
-    // TODO: multi gpu launching
-    Promise<float> promise(realm_training_backing.master_mem);
-    Future<float> future = promise.get_future();
-    RealmTaskArgs<float>* task_arg = new RealmTaskArgs<float>{task_id, impl_function, accessor,
-                                                              std::move(promise)};
-    uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
-        get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
-    future.set_event(e);
-    return future;
-  } else {
-    return Future<float>(0.0f);
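// The spawn/promise pattern above recurs in every entry point that follows; a
// distilled sketch of the idiom, using a hypothetical helper name that is not
// part of this patch:
//
//   template <typename T>
//   Future<T> spawn_on_worker(RealmRuntimeState &rs, task_id_t task_id,
//                             TaskImplFunction impl, TaskArgumentAccessor acc) {
//     Promise<T> promise;
//     Future<T> future = promise.get_future();
//     auto *task_arg = new RealmTaskArgs<T>{task_id, impl, acc, std::move(promise)};
//     uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
//     // chain on the worker's previous event so tasks on one worker run in order
//     Event e = rs.worker_procs[0].spawn(get_realm_task_id(task_id), args,
//                                        sizeof(uintptr_t), rs.worker_events[0]);
//     rs.worker_events[0] = e;
//     future.set_event(e);
//     return future;
//   }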
+Future<std::optional<milliseconds_t>>
+    execute_forward(LocalTaskRegistry const &local_task_registry,
+                    LocalTensorBacking const &local_tensor_backing,
+                    LocalArgsBacking const &local_args_backing,
+                    TrainingLayerPlusContext const &training_layer,
+                    RealmRuntimeState &runtime_state) {
+
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::FWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return Future<std::optional<milliseconds_t>>(std::nullopt);
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+
+  TaskInvocation invocation = lower_to_task_invocation(
+      /*op_task_invocation=*/get_forward_op_task_invocation(
+          training_layer.layer_attrs.op_attrs),
+      /*training_layer=*/training_layer,
+      /*device_specific_device_states=*/device_state);
+
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<std::optional<milliseconds_t>> promise(runtime_state.master_mem);
+  Future<std::optional<milliseconds_t>> future = promise.get_future();
+  RealmTaskArgs<std::optional<milliseconds_t>>* task_arg =
+      new RealmTaskArgs<std::optional<milliseconds_t>>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future;
 }

-Future<float>
-execute_backward(RealmTrainingBacking &realm_training_backing,
-                 layer_guid_t const &operator_node) {
-  if (registry_contains_task_for_layer(realm_training_backing.task_registry,
-                                       operator_node, OpTaskType::BWD)) {
-    ComputationGraphOpAttrs attrs =
-        get_layer_attrs(realm_training_backing.computation_graph, operator_node)
-            .op_attrs;
-    std::optional<DeviceSpecificDeviceStates> device_state =
-        get_per_device_op_state_if_exists(
-            realm_training_backing.realm_args_backing, operator_node);
-    TaskInvocation invocation = lower_to_task_invocation(
-        forward(attrs), operator_node,
-        get_incoming_inputs(realm_training_backing.computation_graph,
-                            operator_node),
-        get_incoming_input_shapes(realm_training_backing.computation_graph,
-                                  operator_node),
-        get_outgoing_tensors(realm_training_backing.computation_graph,
-                             operator_node),
-        get_incoming_weights(realm_training_backing.computation_graph,
-                             operator_node),
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping,
-        device_state);
-    TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
-    task_id_t task_id = invocation.task_id;
-    TaskImplFunction impl_function =
-        realm_training_backing.task_registry.task_mapping.at(task_id)
-            .impl_function;
-    // TODO: multi gpu launching
-    Promise<float> promise(realm_training_backing.master_mem);
-    Future<float> future = promise.get_future();
-    RealmTaskArgs<float>* task_arg = new RealmTaskArgs<float>{task_id, impl_function, accessor,
-                                                              std::move(promise)};
-    uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
-        get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
-    future.set_event(e);
-    return future;
-  } else {
-    return Future<float>(0.0f);
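// A sketch of how these per-layer entry points would drive one training step
// (illustrative only; assumes a `reversed` container helper and the getters
// introduced elsewhere in this patch):
//
//   for (layer_guid_t const &l : topological_ordering(training_cg.computation_graph)) {
//     execute_forward(registry, tensors, args,
//                     get_training_layer_plus_context(training_cg, l), rs);
//   }
//   compute_loss(backing, loss_attrs, rs);
//   for (layer_guid_t const &l : reversed(topological_ordering(training_cg.computation_graph))) {
//     execute_backward(registry, tensors, args,
//                      get_training_layer_plus_context(training_cg, l), rs);
//   }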
+Future<std::optional<milliseconds_t>>
+    execute_backward(LocalTaskRegistry const &local_task_registry,
+                     LocalTensorBacking const &local_tensor_backing,
+                     LocalArgsBacking const &local_args_backing,
+                     TrainingLayerPlusContext const &training_layer,
+                     RealmRuntimeState &runtime_state) {
+
+  std::optional<registered_task_t> maybe_registered_task = try_get_registered_task(
+      local_task_registry, training_layer.layer_guid, OpTaskType::BWD);
+
+  ASSERT(maybe_registered_task.has_value());
+
+  registered_task_t registered_task = maybe_registered_task.value();
+  if (registered_task.is_noop_task()) {
+    return Future<std::optional<milliseconds_t>>(std::nullopt);
+  }
+
+  std::optional<DeviceSpecificDeviceStates> device_state =
+      get_per_device_op_state_if_exists(local_args_backing,
+                                        training_layer.layer_guid);
+  TaskInvocation invocation = lower_to_task_invocation(
+      get_backward_op_task_invocation(training_layer.layer_attrs.op_attrs),
+      training_layer,
+      device_state);
+  TaskArgumentAccessor accessor =
+      get_task_arg_accessor(local_tensor_backing,
+                            local_args_backing.runtime_arg_config,
+                            invocation,
+                            runtime_state.allocators[0]);
+
+  task_id_t task_id = invocation.task_id;
+  TaskImplFunction impl_function =
+      local_task_registry.task_mapping.at(task_id).impl_function;
+  // TODO: multi gpu launching
+  Promise<std::optional<milliseconds_t>> promise(runtime_state.master_mem);
+  Future<std::optional<milliseconds_t>> future = promise.get_future();
+  RealmTaskArgs<std::optional<milliseconds_t>>* task_arg =
+      new RealmTaskArgs<std::optional<milliseconds_t>>{
+          task_id, impl_function, accessor,
+          std::move(promise)};
+  uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
+  Event e = runtime_state.worker_procs[0].spawn(
+      get_realm_task_id(task_id), args, sizeof(uintptr_t),
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
+  future.set_event(e);
+  return future;
 }
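// For the SGD case, the get_update_invocation call in execute_update below
// bottoms out in sgd_update (see the optimizer.h changes later in this patch);
// roughly, and assuming SGD with momentum keeps exactly one velocity buffer:
//
//   TaskInvocation inv = sgd_update(sgd_attrs,
//                                   /*weight=*/group.forward_tensor,
//                                   /*weight_grad=*/group.gradient_tensor,
//                                   /*sgd_v=*/get_only(group.optimizer_tensors));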

-Future execute_update(RealmTrainingBacking &realm_training_backing,
-                      layer_guid_t const &node,
-                      OptimizerAttrs const &optimizer_attrs) {
-  LayerAttrs layer_attrs =
-      get_layer_attrs(realm_training_backing.computation_graph, node);
-  if (layer_attrs.op_attrs.has<WeightAttrs>()) {
-    // get tensors
-    tensor_guid_t weight_tensor = get_only(
-        get_outgoing_tensors(realm_training_backing.computation_graph, node));
-
-    gradient_tensor_t weight_grad_tensor =
-        realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at(
-            weight_tensor);
-    std::vector<optimizer_tensor_t> optimizer_buffer_tensors =
-        realm_training_backing.realm_tensor_backing.tensor_optimizer_mapping.at(
-            weight_tensor);
-
-    // get invocation
+Future execute_update(LocalTrainingBacking const &local_training_backing,
+                      layer_guid_t const &layer_guid,
+                      OptimizerAttrs const &optimizer_attrs,
+                      RealmRuntimeState &runtime_state) {
+  TrainingLayerPlusContext training_layer = get_training_layer_plus_context(
+      local_training_backing.training_computation_graph, layer_guid);
+
+  if (training_layer.layer_attrs.op_attrs.has<WeightAttrs>()) {
+    TrainingTensorGroupWithAttrs weight_tensor_group =
+        get_only(training_layer.output_tensor_groups);
+
     TaskInvocation invocation =
-        get_update_invocation(optimizer_attrs, weight_tensor,
-                              weight_grad_tensor, optimizer_buffer_tensors);
+        get_update_invocation(optimizer_attrs,
+                              weight_tensor_group.forward_tensor,
+                              weight_tensor_group.gradient_tensor,
+                              weight_tensor_group.optimizer_tensors);
     // TODO: https://github.com/flexflow/flexflow-train/issues/1442
     // assert(is_invocation_valid(get_update_signature(attrs), invocation));

-    // execute update
     TaskArgumentAccessor accessor = get_task_arg_accessor(
-        realm_training_backing.realm_tensor_backing,
-        realm_training_backing.realm_args_backing, invocation,
-        realm_training_backing.allocators[0]);
+        local_training_backing.local_tensor_backing,
+        local_training_backing.local_args_backing.runtime_arg_config,
+        invocation,
+        runtime_state.allocators[0]);
+    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
+
     task_id_t task_id = invocation.task_id;
-    register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0],
+    register_wrapper_tasks_generic(0, runtime_state.worker_procs[0],
                                    task_id);
-    TaskImplFunction update_impl_fn = get_update_task_impl(optimizer_attrs);
     // TODO: multi gpu launching
     Promise promise;
     Future future = promise.get_future();
     RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, update_impl_fn, accessor,
                                                 std::move(promise)};
     uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-    Event e = realm_training_backing.worker_procs[0].spawn(
+    Event e = runtime_state.worker_procs[0].spawn(
         get_realm_task_id(task_id), args, sizeof(uintptr_t),
-        realm_training_backing.worker_events[0]);
-    realm_training_backing.worker_events[0] = e;
+        runtime_state.worker_events[0]);
+    runtime_state.worker_events[0] = e;
     future.set_event(e);
     return future;
-  } else {
-    return Future();
   }
 }

-Future compute_loss(RealmTrainingBacking &realm_training_backing,
-                    LossAttrs const &loss_attrs,
-                    tensor_guid_t const &logit_tensor,
-                    loss_tensor_t const &label_tensor) {
+Future compute_loss(LocalTrainingBacking const &local_training_backing,
+                    LossAttrs const &loss_attrs,
+                    RealmRuntimeState &runtime_state) {
+
+  TrainingComputationGraph training_cg =
+      local_training_backing.training_computation_graph;
+  tensor_guid_t logit_tensor = training_cg.logit_tensor;
+  loss_tensor_guid_t label_tensor = training_cg.label_tensor;
+
   TaskInvocation loss_invocation = backward(
-      loss_attrs, logit_tensor,
-      realm_training_backing.realm_tensor_backing.tensor_gradient_mapping.at(
-          logit_tensor),
+      loss_attrs,
+      get_forward_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
+      get_gradient_tensor_guid_for_tensor_guid(training_cg, logit_tensor),
       label_tensor);
   // TODO: https://github.com/flexflow/flexflow-train/issues/1442
   // assert(is_invocation_valid(get_loss_bwd_signature(), loss_invocation));
   TaskArgumentAccessor loss_accessor = get_task_arg_accessor(
-      realm_training_backing.realm_tensor_backing,
-      realm_training_backing.realm_args_backing, loss_invocation,
-      realm_training_backing.allocators[0]);
-  task_id_t task_id = loss_invocation.task_id;
-  register_wrapper_tasks_generic(0, realm_training_backing.worker_procs[0],
-                                 task_id);
+      local_training_backing.local_tensor_backing,
+      local_training_backing.local_args_backing.runtime_arg_config,
+      loss_invocation,
+      runtime_state.allocators[0]);
   TaskImplFunction loss_impl_fn = get_loss_bwd_task_impl();
+
+  task_id_t task_id = loss_invocation.task_id;
+  register_wrapper_tasks_generic(0, runtime_state.worker_procs[0],
+                                 task_id);
   // TODO: multi gpu launching
   Promise promise;
   Future future = promise.get_future();
   RealmTaskArgs* task_arg = new RealmTaskArgs{task_id, loss_impl_fn, loss_accessor,
                                               std::move(promise)};
   uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-  Event e = realm_training_backing.worker_procs[0].spawn(
+  Event e = runtime_state.worker_procs[0].spawn(
       get_realm_task_id(task_id), args, sizeof(uintptr_t),
-      realm_training_backing.worker_events[0]);
-  realm_training_backing.worker_events[0] = e;
+      runtime_state.worker_events[0]);
+  runtime_state.worker_events[0] = e;
   future.set_event(e);
   return future;
 }
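// Taken together, these entry points support the epoch loop that
// ModelTrainingInstance wraps in the e2e test below; per epoch it reduces to:
//
//   model_training_instance.forward();   // execute_forward over each layer
//   model_training_instance.backward();  // compute_loss, then execute_backward in reverse
//   model_training_instance.update();    // execute_update for each weight layer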

-TaskArgumentAccessor
-get_task_arg_accessor(RealmTensorBacking const &realm_tensor_backing,
-                      RealmArgsBacking const &realm_args_backing,
-                      TaskInvocation const &invocation,
-                      Allocator &allocator) {
-  TensorSlotsBacking tensor_slots_backing =
-      construct_tensor_slots_backing(realm_tensor_backing, invocation.binding);
-  ArgSlotsBacking arg_slots_backing = construct_arg_slots_backing(
-      invocation.binding, realm_args_backing.runtime_arg_config);
-  // TODO: multi gpu
-  return TaskArgumentAccessor::create(
-      allocator, tensor_slots_backing, arg_slots_backing);
-}
-
 } // namespace FlexFlow
diff --git a/lib/realm-backend/src/task_wrapper.cc b/lib/realm-backend/src/task_wrapper.cc
index cb220f44dc..b81494dce4 100644
--- a/lib/realm-backend/src/task_wrapper.cc
+++ b/lib/realm-backend/src/task_wrapper.cc
@@ -26,12 +26,13 @@ void fwdbwd_wrapper_task(const void *args, size_t arglen, const void *userdata,
                          size_t userlen, Processor p) {
   assert(arglen == sizeof(uintptr_t));
   uintptr_t task_arg_ptr = *reinterpret_cast<uintptr_t const *>(args);
-  RealmTaskArgs<float> *task_args =
-      reinterpret_cast<RealmTaskArgs<float> *>(task_arg_ptr);
+  RealmTaskArgs<std::optional<milliseconds_t>> *task_args =
+      reinterpret_cast<RealmTaskArgs<std::optional<milliseconds_t>> *>(task_arg_ptr);
   auto fn = task_args->impl_function.get<FwdBwdOpTaskImplFunction>().function_ptr;
-  std::optional<float> result = fn(task_args->accessor);
-  task_args->promise.set_value(result.has_value() ? result.value() : 0.0f);
+  std::optional<milliseconds_t> result = transform(
+      fn(task_args->accessor), [](float running_time) { return milliseconds_t{running_time}; });
+  task_args->promise.set_value(std::move(result));
   delete task_args;
 }
diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc
index fa0976991d..66ff034240 100644
--- a/lib/realm-backend/test/src/test_e2e.cc
+++ b/lib/realm-backend/test/src/test_e2e.cc
@@ -1,17 +1,25 @@
+#include "test_utils.h"
 #include "kernels/compare_tensor_accessors.h"
+#include "kernels/copy_tensor_accessor.h"
 #include "kernels/format_accessor_contents.h"
+#include "kernels/local_cpu_allocator.h"
+#include "kernels/local_cuda_allocator.h"
+#include "kernels/managed_ff_stream.h"
+#include "kernels/managed_per_device_ff_handle.h"
 #include "kernels/tensor_accessor_reductions.h"
-#include "kernels/test_utils.h"
-#include "local-execution/allocated_tensors.h"
-#include "realm-backend/realm_allocator.h"
 #include "realm-backend/realm_training_backing.h"
+#include "realm-backend/model_training_instance.h"
 #include "op-attrs/ops/loss_functions/loss_attrs.dtg.h"
 #include "pcg/computation_graph.h"
 #include "pcg/computation_graph_builder.h"
 #include "pcg/optimizer_attrs.dtg.h"
-#include "test_utils.h"
+#include "task-spec/forward_tensor_source.h"
+#include "task-spec/gradient_tensor_source.h"
+#include "task-spec/loss_tensor_source.h"
+#include "task-spec/optimizer_tensor_source.h"
+#include "task-spec/runtime_arg_config.h"
+#include "task-spec/training_computation_graph.h"
 #include "utils/containers/get_only.h"
-#include "realm-backend/model_training_instance.h"

 using namespace ::FlexFlow;
 using namespace Realm;
@@ -26,158 +34,166 @@ bool did_loss_decrease(GenericTensorAccessorR const &first_epoch,

 void top_level_task(const void *args, size_t arglen, const void *userdata,
                     size_t userlen, Realm::Processor p) {
-  // initialize runtime
-  ManagedFFStream managed_stream{};
-  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
-      /*workSpaceSize=*/1024 * 1024,
-      /*allowTensorOpMathConversion=*/true);
-  std::vector<Processor> worker_procs;
-  std::vector<Allocator> allocators;
-  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
-                                   .only_kind(Processor::TOC_PROC);
-  assert(pq.count() > 0);
-  for (Processor p : pq) {
-    worker_procs.push_back(p);
-    allocators.push_back(create_realm_memory_allocator(p));
-  }
-
-
// allocate label tensors - LossTensorSource loss_tensor_source; - loss_tensor_t label_tensor = loss_tensor_source.new_loss_tensor(); - - positive_int batch_size = 10_p; - positive_int data_dim = 16_p; - positive_int hidden_dim = 32_p; - positive_int output_dim = 1_p; - - TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT}; - TensorShape output_tensor_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT}; - - GenericTensorAccessorW label_tensor_backing = create_random_filled_accessor_w( - output_tensor_shape, allocators[0]); - - // construct computation graph - ComputationGraph computation_graph = make_empty_computation_graph(); - - TensorShape weight_shape_1 = TensorShape{ - TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT}; - TensorShape weight_shape_2 = TensorShape{ - TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT}; - - GenericTensorAccessorW weight_1_backing = create_random_filled_accessor_w( - weight_shape_1, allocators[0]); - GenericTensorAccessorW weight_2_backing = create_random_filled_accessor_w( - weight_shape_2, allocators[0]); - - LayerAddedResult inputs_layer = - add_input_layer_with_grad(computation_graph, input_tensor_shape); - tensor_guid_t input_tensor_guid = get_only(inputs_layer.outputs); - GenericTensorAccessorW input_tensor_backing = create_random_filled_accessor_w( - input_tensor_shape, allocators[0]); - - LayerAddedResult weights_layer_1 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - tensor_guid_t weight_1_tensor_guid = get_only(weights_layer_1.outputs); - - LayerAddedResult weights_layer_2 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{ - weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}}, - std::nullopt}, - {}, - {}); - tensor_guid_t weight_2_tensor_guid = get_only(weights_layer_2.outputs); - - LayerAddedResult linear_operator_1 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - inputs_layer.outputs, - weights_layer_1.outputs); - - LayerAddedResult linear_operator_2 = add_layer( - computation_graph, - LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim, - /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, - std::nullopt}}, - std::nullopt}, - linear_operator_1.outputs, - weights_layer_2.outputs); - - tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs); - - RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{ - DeviceSpecific::create(managed_handle.raw_handle()), - EnableProfiling::YES, - ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}}; - - // initialize training backing - LossAttrs loss_attrs = LossAttrs{NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}}; - OptimizerAttrs optimizer_attrs = - OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001, - /*momentum=*/0.9, - /*nesterov=*/false, - /*weight_decay=*/0.001}}; - - - GradientTensorSource gradient_tensor_source; - OptimizerTensorSource optimizer_tensor_source; - - AllocatedTensors allocated_tensors = AllocatedTensors{ - /*tensor_type_backings=*/{ - {TensorTypeVariant{label_tensor}, label_tensor_backing}, - {TensorTypeVariant{input_tensor_guid}, input_tensor_backing}, - {TensorTypeVariant{weight_1_tensor_guid}, weight_1_backing}, - 
{TensorTypeVariant{weight_2_tensor_guid}, weight_2_backing},
-      },
-      /*gradient_mapping=*/{},
-      /*optimizer_mapping*/ {},
-  };
-
-  {
-    printf("\nRunning test %d: E2ETest...\n", 1);
-    RealmTrainingBacking realm_training_backing = RealmTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    // begin training loop
-    ModelTrainingInstance model_training_instance = ModelTrainingInstance{
-        realm_training_backing, logit_tensor, label_tensor, loss_attrs, optimizer_attrs
+  // initialize runtime
+  ManagedFFStream managed_stream{};
+  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle(
+      /*workSpaceSize=*/1024 * 1024,
+      /*allowTensorOpMathConversion=*/true);
+
+  Memory master_mem = Machine::MemoryQuery(Machine::get_machine())
+                          .only_kind(Memory::SYSTEM_MEM)
+                          .best_affinity_to(p)
+                          .first();
+  std::vector<Processor> worker_procs;
+  std::vector<Event> worker_events;
+  std::vector<Allocator> allocators;
+  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
+                                   .only_kind(Processor::TOC_PROC);
+  assert(pq.count() > 0);
+  for (Processor p : pq) {
+    worker_procs.push_back(p);
+    worker_events.push_back(Event::NO_EVENT);
+    allocators.push_back(create_realm_memory_allocator(p));
+  }
+  RealmRuntimeState runtime_state = RealmRuntimeState{
+      p, Event::NO_EVENT, master_mem, worker_procs, worker_events, allocators};
+
+  positive_int batch_size = 10_p;
+  positive_int data_dim = 16_p;
+  positive_int hidden_dim = 32_p;
+  positive_int output_dim = 1_p;
+
+  TensorShape output_tensor_shape = TensorShape{
+      TensorDims{FFOrdered{batch_size, output_dim}}, DataType::FLOAT};
+
+  // TODO: multi gpu
+  GenericTensorAccessorW label_tensor_backing =
+      runtime_state.allocators[0].allocate_tensor(output_tensor_shape);
+
+  // construct computation graph
+  ComputationGraph computation_graph = make_empty_computation_graph();
+
+  TensorShape input_tensor_shape = TensorShape{
+      TensorDims{FFOrdered{batch_size, data_dim}}, DataType::FLOAT};
+
+  TensorShape weight_shape_1 = TensorShape{
+      TensorDims{FFOrdered{data_dim, hidden_dim}}, DataType::FLOAT};
+  TensorShape weight_shape_2 = TensorShape{
+      TensorDims{FFOrdered{hidden_dim, output_dim}}, DataType::FLOAT};
+
+  LayerAddedResult inputs_layer =
+      add_input_layer_with_grad(computation_graph, input_tensor_shape);
+
+  LayerAddedResult weights_layer_1 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                     weight_shape_1, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                 std::nullopt},
+      {},
+      {});
+
+  LayerAddedResult weights_layer_2 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
+                     weight_shape_2, InitializerAttrs{GlorotNormalAttrs{0}}}},
+                 std::nullopt},
+      {},
+      {});
+
+  LayerAddedResult linear_operator_1 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{hidden_dim,
+                                                     /*use_bias=*/false,
+                                                     DataType::FLOAT,
+                                                     Activation::RELU,
+                                                     std::nullopt}},
+                 std::nullopt},
+      inputs_layer.outputs,
+      weights_layer_1.outputs);
+
+  LayerAddedResult linear_operator_2 = add_layer(
+      computation_graph,
+      LayerAttrs{ComputationGraphOpAttrs{LinearAttrs{output_dim,
+                                                     /*use_bias=*/false,
+                                                     DataType::FLOAT,
+                                                     Activation::RELU,
+                                                     std::nullopt}},
+                 std::nullopt},
+      linear_operator_1.outputs,
+      weights_layer_2.outputs);
+
+  tensor_guid_t logit_tensor = get_only(linear_operator_2.outputs);
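// A quick shape check for the two-layer MLP just built:
//   input  [batch=10 x data=16] @ weight_1 [16 x 32] -> hidden [10 x 32] (ReLU)
//   hidden [10 x 32]            @ weight_2 [32 x 1]  -> logits [10 x 1]  (ReLU)
// so logit_tensor has output_tensor_shape, matching the label backing above.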
+
+  RuntimeArgConfig runtime_arg_config = gpu_make_runtime_arg_config(
+      managed_handle.raw_handle(),
+      EnableProfiling::YES,
+      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1});
+
+  // initialize training backing
+  LossAttrs loss_attrs = LossAttrs{
+      NonconfigurableLossAttrs{LossFunction::CATEGORICAL_CROSSENTROPY}};
+  OptimizerAttrs optimizer_attrs = OptimizerAttrs{
+      SGDOptimizerAttrs{
+          /*lr=*/0.001,
+          /*momentum=*/0.9,
+          /*nesterov=*/false,
+          /*weight_decay=*/0.001,
+      },
   };
-    Allocator cpu_allocator = create_local_cpu_memory_allocator();
-
-    int num_epochs = 5;
-    std::vector<GenericTensorAccessorR> loss_values;
-
-    for (int i = 0; i < num_epochs; i++) {
-      model_training_instance.forward();
-      model_training_instance.backward();
-      model_training_instance.update();
-      loss_values.push_back(copy_tensor_accessor_r(
-          model_training_instance.get_loss_tensor_accessor(), cpu_allocator));
+  ForwardTensorSource forward_tensor_source;
+  GradientTensorSource gradient_tensor_source;
+  OptimizerTensorSource optimizer_tensor_source;
+  LossTensorSource loss_tensor_source;
+
+  TrainingComputationGraph training_computation_graph =
+      generate_training_computation_graph(computation_graph,
+                                          optimizer_attrs,
+                                          logit_tensor,
+                                          forward_tensor_source,
+                                          gradient_tensor_source,
+                                          optimizer_tensor_source,
+                                          loss_tensor_source);
+
+  LocalTrainingBacking local_training_backing =
+      make_local_training_backing_for_computation_graph(
+          /*runtime_state=*/runtime_state,
+          /*preallocated_tensors=*/
+          {
+              {
+                  training_tensor_guid_t{
+                      training_computation_graph.label_tensor},
+                  label_tensor_backing,
+              },
+          },
+          /*training_computation_graph=*/training_computation_graph,
+          /*runtime_arg_config=*/runtime_arg_config,
+          /*optimizer_attrs=*/optimizer_attrs);
+
+  // begin training loop
+  ModelTrainingInstance model_training_instance = ModelTrainingInstance{
+      runtime_state, local_training_backing, loss_attrs, optimizer_attrs};
+
+  {
+    printf("\nRunning test %d: E2ETest...\n", 1);
+    Allocator cpu_allocator = create_local_cpu_memory_allocator();
+
+    int num_epochs = 5;
+    std::vector<GenericTensorAccessorR> loss_values;
+
+    for (int i = 0; i < num_epochs; i++) {
+      model_training_instance.forward();
+      model_training_instance.backward();
+      model_training_instance.update();
+      loss_values.push_back(copy_tensor_accessor_r(
+          model_training_instance.get_loss_tensor_accessor(), cpu_allocator));
+    }
+
+    // Assert that each sample in the batch has a lower loss in last epoch than
+    // the first epoch
+    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
+    GenericTensorAccessorR last_epoch = loss_values.back();
+    assert(did_loss_decrease(first_epoch_loss, last_epoch));
+    printf("passed\n");
   }
-
-    // Assert that each sample in the batch has a lower loss in last epoch than
-    // the first epoch
-    GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
-
-    GenericTensorAccessorR last_epoch = loss_values.back();
-
-    assert(did_loss_decrease(first_epoch_loss, last_epoch));
-    printf("passed\n");
-  }
 }
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
index b1f6bebe74..cd7119271d 100644
--- a/lib/realm-backend/test/src/test_update.cc
+++ b/lib/realm-backend/test/src/test_update.cc
@@ -6,7 +6,7 @@
 #include "pcg/optimizer_attrs.dtg.h"
 #include "realm-backend/driver.h"
 #include "realm-backend/realm_allocator.h"
-#include "realm-backend/realm_training_backing.h"
+#include "realm-backend/local_training_backing.h"
 #include "test_utils.h"

 using namespace ::FlexFlow;
@@ -80,11 +80,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
                                   /*momentum=*/0.0f,
                                   /*nesterov=*/false,
                                   /*weight_decay=*/0.001}};
-
RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } @@ -95,11 +95,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, /*momentum=*/0.9, /*nesterov=*/false, /*weight_decay=*/0.001}}; - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } @@ -114,11 +114,11 @@ void top_level_task(const void *args, size_t arglen, const void *userdata, /*beta_t=*/0.9, /*beta2_t=*/0.999, /*epsilon=*/1e-8}}; - RealmTrainingBacking realm_training_backing = RealmTrainingBacking( + LocalTrainingBacking local_training_backing = LocalTrainingBacking( p, worker_procs, allocators, allocated_tensors, gradient_tensor_source, optimizer_tensor_source, computation_graph, runtime_arg_config, optimizer_attrs); - execute_update(realm_training_backing, linear_operator.layer, optimizer_attrs).wait(); + execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait(); printf("passed\n"); } } diff --git a/lib/runtime/src/ops/embedding.cc b/lib/runtime/src/ops/embedding.cc deleted file mode 100644 index 83e7c15460..0000000000 --- a/lib/runtime/src/ops/embedding.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "embedding.h" -#include "kernels/embedding_kernels.h" -#include "op-attrs/get_output_shapes.h" -#include "op-attrs/ops/embedding.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Embedding; - -enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING }; - -OpTaskInvocation forward(EmbeddingAttrs const &attrs) { - OpTaskBinding b; - - b.bind(INPUT, input_tensor(0)); - b.bind(WEIGHT, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - - b.bind_arg(ATTRS, attrs); - b.bind_arg(PROFILING, profiling_settings()); - - return {EMBED_FWD_TASK_ID, b}; -} - -OpTaskInvocation backward(EmbeddingAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {EMBED_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto weight = acc.get_tensor(WEIGHT); - auto output = acc.get_tensor(OUTPUT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - EmbeddingAttrs attrs = acc.get_argument(ATTRS); - - return profile(forward_kernel, - profiling, - "[Embedding] forward_time = {:.2lf}ms\n", - input, - output, - weight, - input.data_type, - output.data_type, - attrs.aggr, - input.shape.get_dim(), - output.shape.get_dim(), - input.shape[legion_dim_t(1)]); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - auto weight_grad = acc.get_tensor_grad(WEIGHT); - - ProfilingSettings profiling = acc.get_argument(PROFILING); - EmbeddingAttrs attrs = acc.get_argument(ATTRS); - - return profile(backward_kernel, - profiling, - "[Embedding] backward_time = {:.2lf}ms\n", - output, - input, - weight_grad, - output.data_type, - input.data_type, - attrs.aggr, - input.shape.get_dim(), - output.shape.get_dim(), - input.shape.at(ff_dim_t{nonnegative_int{0}})); -} - -TaskImplFunction get_embedding_fwd_task_impl() { - return forward_task_impl; -} -TaskImplFunction get_embedding_bwd_task_impl() { - return backward_task_impl; -} - -OpTaskSignature get_embedding_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_input_slot(INPUT); - fwd.add_input_slot(OUTPUT); - fwd.add_input_slot(WEIGHT); - - fwd.add_arg_slot(ATTRS); - fwd.add_arg_slot(PROFILING); - - return fwd; -} - -OpTaskSignature get_embedding_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_embedding_fwd_signature()); - return bwd; -} - -std::vector get_task_ids(EmbeddingAttrs const &) { - return {EMBED_FWD_TASK_ID, EMBED_BWD_TASK_ID}; -} - -} // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/concrete_arg.h b/lib/task-spec/include/task-spec/concrete_arg_spec.h similarity index 89% rename from lib/task-spec/include/task-spec/concrete_arg.h rename to lib/task-spec/include/task-spec/concrete_arg_spec.h index 7b2ece59a7..24a96e9f78 100644 --- a/lib/task-spec/include/task-spec/concrete_arg.h +++ b/lib/task-spec/include/task-spec/concrete_arg_spec.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H -#define _FLEXFLOW_LOCAL_EXECUTION_CONCRETE_ARG_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_CONCRETE_ARG_SPEC_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_CONCRETE_ARG_SPEC_H #include "fmt/format.h" #include "task-spec/serialization.h" @@ -15,7 +15,7 @@ struct ConcreteArgSpec { template T const &get() const { - assert(matches(this->type_idx)); + ASSERT(matches(this->type_idx), this->type_idx.name()); return *(T const *)ptr.get(); } diff --git 
a/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
index 944dddc3df..b77850c50d 100644
--- a/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
+++ b/lib/task-spec/include/task-spec/device_specific_device_states.variant.toml
@@ -5,82 +5,71 @@ features = [
 ]

 includes = [
-  "kernels/attention_kernels.h",
+  "kernels/mha_per_device_state.dtg.h",
   "kernels/batch_norm_per_device_state.dtg.h",
-  "kernels/conv_2d_kernels.h",
-  "kernels/dropout_kernels.h",
-  "kernels/element_binary_kernels.h",
-  "kernels/element_unary_kernels.h",
-  "kernels/gather_kernels.h",
-  "kernels/layer_norm_kernels.h",
-  "kernels/linear_kernels.h",
-  "kernels/partition_kernels.h",
-  "kernels/pool_2d_kernels.h",
-  "kernels/reduce_kernels.h",
-  "kernels/reduction_kernels.h",
-  "kernels/reshape_kernels.h",
-  "kernels/softmax_kernels.h",
-  "kernels/topk_kernels.h",
-  "kernels/transpose_kernels.h",
+  "kernels/conv_2d_per_device_state.dtg.h",
+  "kernels/dropout_per_device_state.dtg.h",
+  "kernels/element_binary_per_device_state.dtg.h",
+  "kernels/element_unary_per_device_state.dtg.h",
+  "kernels/gather_per_device_state.dtg.h",
+  "kernels/layer_norm_per_device_state.dtg.h",
+  "kernels/linear_per_device_state.dtg.h",
+  "kernels/partition_per_device_state.dtg.h",
+  "kernels/pool_2d_per_device_state.dtg.h",
+  "kernels/reduce_per_device_state.dtg.h",
+  "kernels/softmax_per_device_state.dtg.h",
   "task-spec/device_specific.h",
+  "<optional>",
 ]

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::MHAPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::MHAPerDeviceState>>"
 key = "device_specific_mha_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::BatchNormPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::BatchNormPerDeviceState>>"
 key = "device_specific_batch_norm_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::Conv2DPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::Conv2DPerDeviceState>>"
 key = "device_specific_conv2d_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::DropoutPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::DropoutPerDeviceState>>"
 key = "device_specific_dropout_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ElementBinaryPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ElementBinaryPerDeviceState>>"
 key = "device_specific_element_binary_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ElementUnaryPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ElementUnaryPerDeviceState>>"
 key = "device_specific_element_unary_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::GatherPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::GatherPerDeviceState>>"
 key = "device_specific_gather_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::LayerNormPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::LayerNormPerDeviceState>>"
 key = "device_specific_layer_norm_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::LinearPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::LinearPerDeviceState>>"
 key = "device_specific_linear_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::Pool2DPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::Pool2DPerDeviceState>>"
 key = "device_specific_pool_2d_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::ReducePerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::ReducePerDeviceState>>"
 key = "device_specific_reduce_per_device_state"

 [[values]]
-type = "::FlexFlow::DeviceSpecific<::FlexFlow::RepartitionPerDeviceState>"
+type = "::FlexFlow::DeviceSpecific<std::optional<::FlexFlow::RepartitionPerDeviceState>>"
"device_specific_repartition_per_device_state" [[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::ReshapePerDeviceState>" -key = "device_specific_reshape_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::SoftmaxPerDeviceState>" +type = "::FlexFlow::DeviceSpecific>" key = "device_specific_softmax_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::TopKPerDeviceState>" -key = "device_specific_topk_per_device_state" diff --git a/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml b/lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml similarity index 79% rename from lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml index 5d3e05f673..68fc4b6815 100644 --- a/lib/task-spec/include/task-spec/optimizer_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/forward_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "optimizer_tensor_t" +name = "forward_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/forward_tensor_source.h b/lib/task-spec/include/task-spec/forward_tensor_source.h new file mode 100644 index 0000000000..7adde6e145 --- /dev/null +++ b/lib/task-spec/include/task-spec/forward_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FORWARD_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_FORWARD_TENSOR_SOURCE_H + +#include "task-spec/forward_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct ForwardTensorSource { +public: + ForwardTensorSource(); + + forward_tensor_guid_t new_forward_tensor(); + + void reset(); + +private: + static int next_available_forward_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml b/lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml similarity index 78% rename from lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml index 5367ccee07..b75e27a9d2 100644 --- a/lib/task-spec/include/task-spec/gradient_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/gradient_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "gradient_tensor_t" +name = "gradient_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/gradient_tensor_source.h b/lib/task-spec/include/task-spec/gradient_tensor_source.h new file mode 100644 index 0000000000..14ebf05d43 --- /dev/null +++ b/lib/task-spec/include/task-spec/gradient_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GRADIENT_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_GRADIENT_TENSOR_SOURCE_H + +#include "task-spec/gradient_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct GradientTensorSource { +public: + GradientTensorSource(); + + gradient_tensor_guid_t new_gradient_tensor(); + + void reset(); + +private: + static int next_available_gradient_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/init_op_task_impl_function.h b/lib/task-spec/include/task-spec/init_op_task_impl_function.h index f82d249df1..97daa7ef56 100644 --- a/lib/task-spec/include/task-spec/init_op_task_impl_function.h +++ b/lib/task-spec/include/task-spec/init_op_task_impl_function.h @@ -7,15 +7,16 @@ namespace FlexFlow { struct 
InitOpTaskImplFunction { - - DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); - +public: bool operator==(InitOpTaskImplFunction const &) const; bool operator!=(InitOpTaskImplFunction const &) const; bool operator<(InitOpTaskImplFunction const &) const; bool operator>(InitOpTaskImplFunction const &) const; bool operator<=(InitOpTaskImplFunction const &) const; bool operator>=(InitOpTaskImplFunction const &) const; + +public: + DeviceSpecificDeviceStates (*function_ptr)(TaskArgumentAccessor const &); }; std::string format_as(InitOpTaskImplFunction const &x); diff --git a/lib/task-spec/include/task-spec/itask_argument_accessor.h b/lib/task-spec/include/task-spec/itask_argument_accessor.h index e7d1a81760..2e693e7983 100644 --- a/lib/task-spec/include/task-spec/itask_argument_accessor.h +++ b/lib/task-spec/include/task-spec/itask_argument_accessor.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_ITASK_ARGUMENT_ACCESSOR_H #include "kernels/allocation.h" -#include "task-spec/concrete_arg.h" +#include "task-spec/concrete_arg_spec.h" #include "task-spec/op_task_signature.h" #include "task-spec/privilege_tensor_accessor.h" #include "task-spec/tensor_type.dtg.h" diff --git a/lib/local-execution/include/local-execution/loss_functions.h b/lib/task-spec/include/task-spec/loss_functions.h similarity index 69% rename from lib/local-execution/include/local-execution/loss_functions.h rename to lib/task-spec/include/task-spec/loss_functions.h index c75d4414de..a5f5886caa 100644 --- a/lib/local-execution/include/local-execution/loss_functions.h +++ b/lib/task-spec/include/task-spec/loss_functions.h @@ -13,12 +13,13 @@ * limitations under the License. */ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ -#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_LOSS_FUNCTIONS_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_FUNCTIONS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_FUNCTIONS_H #include "op-attrs/ops/loss_functions.h" -#include "pcg/tensor_guid_t.dtg.h" -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/forward_tensor_guid_t.dtg.h" +#include "task-spec/gradient_tensor_guid_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" #include "task-spec/task_impl_function.dtg.h" #include "task-spec/task_invocation.dtg.h" #include "task-spec/task_signature.h" @@ -28,9 +29,9 @@ namespace FlexFlow { TaskImplFunction get_loss_bwd_task_impl(); TaskSignature get_loss_bwd_signature(); TaskInvocation backward(LossAttrs const &, - tensor_guid_t logit, - gradient_tensor_t logit_grad, - loss_tensor_t label); + forward_tensor_guid_t logit, + gradient_tensor_guid_t logit_grad, + loss_tensor_guid_t label); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml b/lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml similarity index 87% rename from lib/task-spec/include/task-spec/loss_tensor_t.struct.toml rename to lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml index 405385069f..c00ccbb0f2 100644 --- a/lib/task-spec/include/task-spec/loss_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/loss_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "loss_tensor_t" +name = "loss_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/local-execution/include/local-execution/loss_tensor_source.h b/lib/task-spec/include/task-spec/loss_tensor_source.h similarity index 50% rename from 
lib/local-execution/include/local-execution/loss_tensor_source.h rename to lib/task-spec/include/task-spec/loss_tensor_source.h index b794207c7f..21091109e5 100644 --- a/lib/local-execution/include/local-execution/loss_tensor_source.h +++ b/lib/task-spec/include/task-spec/loss_tensor_source.h @@ -1,7 +1,7 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H -#define _FLEXFLOW_LOCAL_EXECUTION_LOSS_TENSOR_SOURCE_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_LOSS_TENSOR_SOURCE_H -#include "task-spec/loss_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { @@ -10,7 +10,7 @@ struct LossTensorSource { public: LossTensorSource(); - loss_tensor_t new_loss_tensor(); + loss_tensor_guid_t new_loss_tensor(); private: static nonnegative_int next_available_loss_tensor_id; diff --git a/lib/task-spec/include/task-spec/op_arg_ref.h b/lib/task-spec/include/task-spec/op_arg_ref.h index d95573787a..88882abd46 100644 --- a/lib/task-spec/include/task-spec/op_arg_ref.h +++ b/lib/task-spec/include/task-spec/op_arg_ref.h @@ -15,14 +15,16 @@ using OpArgRef = ArgRef; using OpArgRefSpec = ArgRefSpec; template -OpArgRef per_device_op_state() { +OpArgRef per_device_op_state() { OpArgRefType op_arg_ref_type = OpArgRefType{PerDeviceOpStateRefType{}}; static_assert(PerDeviceOpState::IsPartOfPerDeviceOpState_v); - ArgRef arg_ref = {op_arg_ref_type}; + ArgRef arg_ref = {op_arg_ref_type}; return arg_ref; } -OpArgRef input_parallel_tensor_shape(int idx); +OpArgRef input_parallel_tensor_shape(nonnegative_int idx); +OpArgRef weight_parallel_tensor_shape(nonnegative_int idx); +OpArgRef output_parallel_tensor_shape(nonnegative_int idx); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/op_arg_spec.variant.toml b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml index e52e5c914e..a03bc222e8 100644 --- a/lib/task-spec/include/task-spec/op_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/op_arg_spec.variant.toml @@ -10,7 +10,7 @@ features = [ ] includes = [ - "task-spec/concrete_arg.h", + "task-spec/concrete_arg_spec.h", "task-spec/op_arg_ref.h", "task-spec/runtime_arg_ref.h", ] diff --git a/lib/task-spec/include/task-spec/op_task_binding.h b/lib/task-spec/include/task-spec/op_task_binding.h new file mode 100644 index 0000000000..bcfea33877 --- /dev/null +++ b/lib/task-spec/include/task-spec/op_task_binding.h @@ -0,0 +1,97 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OP_TASK_BINDING_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OP_TASK_BINDING_H + +#include "task-spec/op_arg_ref.h" +#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_tensor_spec.h" +#include "task-spec/slot_grad_id.dtg.h" +#include "task-spec/slot_id_t.dtg.h" +#include "task-spec/variadic_tensor_ref.h" + +namespace FlexFlow { + +struct OpTaskBinding { + OpTaskBinding() = default; + + void bind(int, VariadicTensorRef const &); + void bind(slot_id_t, VariadicTensorRef const &); + + void bind(int, OpTensorSpec const &); + void bind(slot_id_t, OpTensorSpec const &); + + void bind_grad(int, OpTensorSpec const &); + void bind_grad(slot_id_t, OpTensorSpec const &); + + template + void bind_device_specific_arg(int name, T const &t) { + this->bind_device_specific_arg(slot_id_t{name}, t); + } + + template + void bind_device_specific_arg(slot_id_t name, T const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_device_specific_arg(int name, 
OpArgRef const &t) { + this->bind_device_specific_arg(slot_id_t{name}, t); + } + + template + void bind_device_specific_arg(slot_id_t name, OpArgRef const &t) { + NOT_IMPLEMENTED(); + } + + template + void bind_arg(int name, T const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, T const &t) { + this->insert_arg_spec(name, OpArgSpec{ConcreteArgSpec::create(t)}); + } + + template + void bind_arg(int name, RuntimeArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { + this->insert_arg_spec(name, OpArgSpec{RuntimeArgRefSpec::create(ref)}); + } + + template + void bind_arg(int name, OpArgRef const &t) { + this->bind_arg(slot_id_t{name}, t); + } + + template + void bind_arg(slot_id_t name, OpArgRef const &ref) { + this->insert_arg_spec(name, OpArgSpec{OpArgRefSpec::create(ref)}); + } + bool operator==(OpTaskBinding const &other) const; + bool operator!=(OpTaskBinding const &other) const; + + std::unordered_map const & + get_tensor_bindings() const; + std::unordered_map const &get_arg_bindings() const; + + void bind_from_forward(OpTaskBinding const &fwd); + +private: + std::unordered_map tensor_bindings; + std::unordered_map arg_bindings; + +private: + void insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec); + std::tuple + tie() const; +}; + +OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/op_task_invocation.h b/lib/task-spec/include/task-spec/op_task_invocation.h index cce0a4d6a6..88e9e9bf26 100644 --- a/lib/task-spec/include/task-spec/op_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_invocation.h @@ -1,118 +1,11 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_INVOCATION_H -#include "kernels/accessor.h" -#include "task-spec/concrete_arg.h" -#include "task-spec/is_trainable.dtg.h" -#include "task-spec/op_arg_ref.h" -#include "task-spec/op_arg_spec.dtg.h" +#include "task-spec/op_task_invocation.dtg.h" #include "task-spec/op_task_signature.h" -#include "task-spec/op_tensor_spec.h" -#include "task-spec/profiling.h" -#include "task-spec/runtime_arg_ref.h" -#include "task-spec/slot_grad_id.dtg.h" -#include "task-spec/task_id_t.dtg.h" -#include "task-spec/variadic_tensor_ref.h" -#include -#include -#include -#include namespace FlexFlow { -struct OpTaskBinding { - OpTaskBinding() = default; - - void bind(int, VariadicTensorRef const &); - void bind(slot_id_t, VariadicTensorRef const &); - - void bind(int, OpTensorSpec const &); - void bind(slot_id_t, OpTensorSpec const &); - - void bind_grad(int, OpTensorSpec const &); - void bind_grad(slot_id_t, OpTensorSpec const &); - - template - void bind_device_specific_arg(int name, T const &t) { - this->bind_device_specific_arg(slot_id_t{name}, t); - } - - template - void bind_device_specific_arg(slot_id_t name, T const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_device_specific_arg(int name, OpArgRef const &t) { - this->bind_device_specific_arg(slot_id_t{name}, t); - } - - template - void bind_device_specific_arg(slot_id_t name, OpArgRef const &t) { - NOT_IMPLEMENTED(); - } - - template - void bind_arg(int name, T const &t) { - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, T const &t) { - this->insert_arg_spec(name, OpArgSpec{ConcreteArgSpec::create(t)}); - } - - template - void bind_arg(int name, RuntimeArgRef const &t) 
{ - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, RuntimeArgRef const &ref) { - this->insert_arg_spec(name, OpArgSpec{RuntimeArgRefSpec::create(ref)}); - } - - template - void bind_arg(int name, OpArgRef const &t) { - this->bind_arg(slot_id_t{name}, t); - } - - template - void bind_arg(slot_id_t name, OpArgRef const &ref) { - this->insert_arg_spec(name, OpArgSpec{OpArgRefSpec::create(ref)}); - } - bool operator==(OpTaskBinding const &other) const; - bool operator!=(OpTaskBinding const &other) const; - - std::unordered_map const & - get_tensor_bindings() const; - std::unordered_map const &get_arg_bindings() const; - - void bind_from_forward(OpTaskBinding const &fwd); - -private: - std::unordered_map tensor_bindings; - std::unordered_map arg_bindings; - -private: - void insert_arg_spec(slot_id_t name, OpArgSpec const &arg_spec); - std::tuple - tie() const; -}; - -struct OpTaskInvocation { -public: - OpTaskInvocation() = delete; - OpTaskInvocation(task_id_t task_id, OpTaskBinding const &binding) - : task_id(task_id), binding(binding) {} - -public: - task_id_t task_id; - OpTaskBinding binding; -}; - -OpTaskBinding infer_bwd_binding(OpTaskBinding const &fwd); - bool is_invocation_valid(OpTaskSignature const &sig, OpTaskInvocation const &inv); diff --git a/lib/task-spec/include/task-spec/op_task_invocation.struct.toml b/lib/task-spec/include/task-spec/op_task_invocation.struct.toml new file mode 100644 index 0000000000..465fa5f1ff --- /dev/null +++ b/lib/task-spec/include/task-spec/op_task_invocation.struct.toml @@ -0,0 +1,16 @@ +namespace = "FlexFlow" +name = "OpTaskInvocation" +features = [] + +includes = [ + "task-spec/op_task_binding.h", + "task-spec/task_id_t.dtg.h", +] + +[[fields]] +name = "task_id" +type = "::FlexFlow::task_id_t" + +[[fields]] +name = "binding" +type = "::FlexFlow::OpTaskBinding" diff --git a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h index 68c7f05d77..3208e9d049 100644 --- a/lib/task-spec/include/task-spec/op_task_to_task_invocation.h +++ b/lib/task-spec/include/task-spec/op_task_to_task_invocation.h @@ -1,31 +1,42 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TASK_TO_TASK_INVOCATION_H +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" #include "pcg/computation_graph.dtg.h" #include "pcg/layer_guid_t.dtg.h" #include "task-spec/device_specific_device_states.dtg.h" #include "task-spec/op_task_invocation.h" -#include "task-spec/runtime_arg_config.h" +#include "task-spec/runtime_arg_config.dtg.h" #include "task-spec/task_invocation.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" namespace FlexFlow { -TaskInvocation lower_to_task_invocation( - OpTaskInvocation const &, - layer_guid_t const &, - std::vector const &input_tensors, - std::vector const &input_tensor_shapes, - std::vector const &output_tensors, - std::vector const &weight_tensors, - std::unordered_map const &, - std::optional const &); +TaskInvocation + lower_to_task_invocation(OpTaskInvocation const &op_task_invocation, + TrainingLayerPlusContext const &training_layer, + std::optional const + &device_specific_device_states); + +std::pair lower_tensor_binding( + TrainingLayerTensorGroupSignature const &training_layer_signature, + SlotGradId const &slot_grad_id, + OpTensorSpec const &op_tensor_spec); + +TaskArgSpec lower_to_task_arg_spec( + OpArgSpec const 
&op_arg_spec, + CGOperatorTensorShapeSignature const &op_shape_signature, + layer_guid_t const &layer_guid, + std::optional const + &device_specific_device_states); ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &, RuntimeArgConfig const &); ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &, - std::vector const &, + CGOperatorTensorShapeSignature const &, layer_guid_t const &, std::optional const &); diff --git a/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml index 109ddf36af..3a388b8559 100644 --- a/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml +++ b/lib/task-spec/include/task-spec/op_tensor_slot_spec.struct.toml @@ -10,7 +10,7 @@ features = [ includes = [ "task-spec/slot_id_t.dtg.h", "task-spec/slot_type.dtg.h", - "task-spec/tensor_role.dtg.h", + "pcg/tensor_role.dtg.h", "task-spec/is_grad.dtg.h", "task-spec/op_slot_options.dtg.h", ] diff --git a/lib/task-spec/include/task-spec/op_tensor_spec.h b/lib/task-spec/include/task-spec/op_tensor_spec.h index c957704a10..6f00a2e38d 100644 --- a/lib/task-spec/include/task-spec/op_tensor_spec.h +++ b/lib/task-spec/include/task-spec/op_tensor_spec.h @@ -1,21 +1,15 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H #define _FLEXFLOW_LOCAL_EXECUTION_OP_TENSOR_SPEC_REF_H -#include "task-spec/op_task_signature.h" +#include "task-spec/op_tensor_spec.dtg.h" namespace FlexFlow { -struct OpTensorSpec { - TensorRole role; - OpSlotOptions slot_option; - req idx; -}; -FF_VISITABLE_STRUCT(OpTensorSpec, role, slot_option, idx); - -OpTensorSpec input_tensor(int, OpSlotOptions option = OpSlotOptions::NECESSARY); -OpTensorSpec output_tensor(int, +OpTensorSpec input_tensor(nonnegative_int idx, + OpSlotOptions option = OpSlotOptions::NECESSARY); +OpTensorSpec output_tensor(nonnegative_int idx, OpSlotOptions option = OpSlotOptions::NECESSARY); -OpTensorSpec weight_tensor(int, +OpTensorSpec weight_tensor(nonnegative_int idx, OpSlotOptions option = OpSlotOptions::NECESSARY); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml b/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml new file mode 100644 index 0000000000..3e790c7e08 --- /dev/null +++ b/lib/task-spec/include/task-spec/op_tensor_spec.struct.toml @@ -0,0 +1,28 @@ +namespace = "FlexFlow" +name = "OpTensorSpec" +features = [ + "eq", + "ord", + "hash", + "json", + "fmt", + "rapidcheck", +] + +includes = [ + "pcg/tensor_role.dtg.h", + "task-spec/op_slot_options.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", +] + +[[fields]] +name = "role" +type = "::FlexFlow::TensorRole" + +[[fields]] +name = "slot_option" +type = "::FlexFlow::OpSlotOptions" + +[[fields]] +name = "idx" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/task-spec/include/task-spec/ops/combine.h b/lib/task-spec/include/task-spec/ops/combine.h deleted file mode 100644 index ea7b3ed365..0000000000 --- a/lib/task-spec/include/task-spec/ops/combine.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_COMBINE_H -#define _FLEXFLOW_COMBINE_H - -#include "op-attrs/ops/combine_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(CombineAttrs const &); - -TaskImplFunction get_combine_fwd_task_impl(); -TaskImplFunction get_combine_bwd_task_impl(); - -OpTaskSignature get_combine_fwd_signature(); -OpTaskSignature get_combine_bwd_signature(); - -OpTaskInvocation 
forward(CombineAttrs const &); -OpTaskInvocation backward(CombineAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/reduction.h b/lib/task-spec/include/task-spec/ops/reduction.h deleted file mode 100644 index 5ddf292672..0000000000 --- a/lib/task-spec/include/task-spec/ops/reduction.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _FLEXFLOW_REDUCTION_H -#define _FLEXFLOW_REDUCTION_H - -#include "op-attrs/ops/reduction_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReductionAttrs const &); - -TaskImplFunction get_reduction_fwd_task_impl(); -TaskImplFunction get_reduction_bwd_task_impl(); - -OpTaskSignature get_reduction_fwd_signature(); -OpTaskSignature get_reduction_bwd_signature(); - -OpTaskInvocation init(ReductionAttrs const &); -OpTaskInvocation forward(ReductionAttrs const &); -OpTaskInvocation backward(ReductionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/repartition.h b/lib/task-spec/include/task-spec/ops/repartition.h deleted file mode 100644 index dfc42c54e5..0000000000 --- a/lib/task-spec/include/task-spec/ops/repartition.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _FLEXFLOW_PARTITION_H -#define _FLEXFLOW_PARTITION_H - -#include "op-attrs/ops/repartition_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(RepartitionAttrs const &); - -TaskImplFunction get_repartition_init_task_impl(); -TaskImplFunction get_repartition_fwd_task_impl(); -TaskImplFunction get_repartition_bwd_task_impl(); - -OpTaskSignature get_repartition_init_signature(); -OpTaskSignature get_repartition_fwd_signature(); -OpTaskSignature get_repartition_bwd_signature(); - -OpTaskInvocation init(RepartitionAttrs const &); -OpTaskInvocation forward(RepartitionAttrs const &); -OpTaskInvocation backward(RepartitionAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/replicate.h b/lib/task-spec/include/task-spec/ops/replicate.h deleted file mode 100644 index 18f6f74b19..0000000000 --- a/lib/task-spec/include/task-spec/ops/replicate.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _FLEXFLOW_REPLICATE_H -#define _FLEXFLOW_REPLICATE_H - -#include "op-attrs/ops/replicate_attrs.dtg.h" -#include "task-spec/op_task_invocation.h" -#include "task-spec/task_impl_function.dtg.h" - -namespace FlexFlow { - -std::vector get_task_ids(ReplicateAttrs const &); - -TaskImplFunction get_replicate_fwd_task_impl(); -TaskImplFunction get_replicate_bwd_task_impl(); - -OpTaskSignature get_replicate_fwd_signature(); -OpTaskSignature get_replicate_bwd_signature(); - -OpTaskInvocation forward(ReplicateAttrs const &); -OpTaskInvocation backward(ReplicateAttrs const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/task-spec/include/task-spec/ops/reshape.h b/lib/task-spec/include/task-spec/ops/reshape.h index 29d29ae84c..e5bf7170fb 100644 --- a/lib/task-spec/include/task-spec/ops/reshape.h +++ b/lib/task-spec/include/task-spec/ops/reshape.h @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(ReshapeAttrs const &); -TaskImplFunction get_reshape_init_task_impl(); TaskImplFunction get_reshape_fwd_task_impl(); TaskImplFunction get_reshape_bwd_task_impl(); -OpTaskSignature get_reshape_init_signature(); OpTaskSignature get_reshape_fwd_signature(); OpTaskSignature 
get_reshape_bwd_signature(); -OpTaskInvocation init(ReshapeAttrs const &); OpTaskInvocation forward(ReshapeAttrs const &); OpTaskInvocation backward(ReshapeAttrs const &); diff --git a/lib/task-spec/include/task-spec/ops/topk.h b/lib/task-spec/include/task-spec/ops/topk.h index 33f2dbc5d7..ca1d43c2ee 100644 --- a/lib/task-spec/include/task-spec/ops/topk.h +++ b/lib/task-spec/include/task-spec/ops/topk.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_TOPK_H_ -#define _FLEXFLOW_TOPK_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_TOPK_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPS_TOPK_H #include "op-attrs/ops/topk_attrs.dtg.h" #include "task-spec/op_task_invocation.h" @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector get_task_ids(TopKAttrs const &); -TaskImplFunction get_topk_init_task_impl(); TaskImplFunction get_topk_fwd_task_impl(); TaskImplFunction get_topk_bwd_task_impl(); -OpTaskSignature get_topk_init_signature(); OpTaskSignature get_topk_fwd_signature(); OpTaskSignature get_topk_bwd_signature(); -OpTaskInvocation init(TopKAttrs const &); OpTaskInvocation forward(TopKAttrs const &); OpTaskInvocation backward(TopKAttrs const &); diff --git a/lib/local-execution/include/local-execution/optimizer.h b/lib/task-spec/include/task-spec/optimizer.h similarity index 51% rename from lib/local-execution/include/local-execution/optimizer.h rename to lib/task-spec/include/task-spec/optimizer.h index e4a9c78743..5b898d8699 100644 --- a/lib/local-execution/include/local-execution/optimizer.h +++ b/lib/task-spec/include/task-spec/optimizer.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ -#define _FLEXFLOW_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_OPTIMIZER_H_ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_H #include "pcg/optimizer_attrs.dtg.h" #include "pcg/optimizers/adam_optimizer_attrs.dtg.h" @@ -13,24 +13,24 @@ namespace FlexFlow { TaskSignature get_update_signature(OptimizerAttrs const &); TaskInvocation get_update_invocation( OptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - std::vector const &grad_buffer_tensors); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + std::vector const &grad_buffer_tensors); TaskImplFunction get_update_task_impl(OptimizerAttrs const &); TaskSignature get_sgd_update_signature(); TaskInvocation sgd_update(SGDOptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &sgd_v); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &sgd_v); TaskImplFunction get_sgd_update_task_impl(); TaskSignature get_adam_update_signature(); TaskInvocation adam_update(AdamOptimizerAttrs const &, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &adam_v, - optimizer_tensor_t const &adam_m); + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &adam_v, + optimizer_tensor_guid_t const &adam_m); TaskImplFunction get_adam_update_task_impl(); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml b/lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml similarity index 78% rename from lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml rename to 
lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml index 287e548a5b..dc5f98886f 100644 --- a/lib/task-spec/include/task-spec/lowered_tensor_t.struct.toml +++ b/lib/task-spec/include/task-spec/optimizer_tensor_guid_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "lowered_tensor_t" +name = "optimizer_tensor_guid_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/optimizer_tensor_source.h b/lib/task-spec/include/task-spec/optimizer_tensor_source.h new file mode 100644 index 0000000000..2f10c5c35b --- /dev/null +++ b/lib/task-spec/include/task-spec/optimizer_tensor_source.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_TENSOR_SOURCE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_OPTIMIZER_TENSOR_SOURCE_H + +#include "task-spec/optimizer_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +struct OptimizerTensorSource { +public: + OptimizerTensorSource(); + + optimizer_tensor_guid_t new_optimizer_tensor(); + + void reset(); + +private: + static int next_available_optimizer_tensor_id; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml index fe340f4451..4ff411d17b 100644 --- a/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml +++ b/lib/task-spec/include/task-spec/parallel_tensor_shape_ref_type.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "ParallelTensorShapeRefType" - features = [ "eq", "ord", @@ -9,6 +8,15 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "pcg/tensor_role.dtg.h", +] + +[[fields]] +name = "tensor_role" +type = "::FlexFlow::TensorRole" + [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/task-spec/include/task-spec/per_device_op_state.h b/lib/task-spec/include/task-spec/per_device_op_state.h index 23312d90a5..ae6c93807c 100644 --- a/lib/task-spec/include/task-spec/per_device_op_state.h +++ b/lib/task-spec/include/task-spec/per_device_op_state.h @@ -1,8 +1,10 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H -#define _FLEXFLOW_LOCAL_EXECUTION_PER_DEVICE_STATE_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PER_DEVICE_OP_STATE_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_PER_DEVICE_OP_STATE_H +#include "task-spec/concrete_arg_spec.h" #include "task-spec/device_specific_device_states.dtg.h" #include "task-spec/per_device_op_state.dtg.h" +#include "utils/type_index.h" namespace FlexFlow { diff --git a/lib/task-spec/include/task-spec/per_device_op_state.variant.toml b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml index 0171e3e497..7c340447f9 100644 --- a/lib/task-spec/include/task-spec/per_device_op_state.variant.toml +++ b/lib/task-spec/include/task-spec/per_device_op_state.variant.toml @@ -3,80 +3,70 @@ name = "PerDeviceOpState" features = [] includes = [ - "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", - "kernels/conv_2d_kernels.h", - "kernels/dropout_kernels.h", - "kernels/element_binary_kernels.h", - "kernels/element_unary_kernels.h", - "kernels/gather_kernels.h", - "kernels/layer_norm_kernels.h", - "kernels/linear_kernels.h", - "kernels/partition_kernels.h", - "kernels/pool_2d_kernels.h", - "kernels/reduce_kernels.h", - "kernels/reduction_kernels.h", - "kernels/reshape_kernels.h", - "kernels/softmax_kernels.h", - "kernels/topk_kernels.h", + 
"kernels/mha_per_device_state.dtg.h", + "kernels/batch_norm_per_device_state.dtg.h", + "kernels/conv_2d_per_device_state.dtg.h", + "kernels/dropout_per_device_state.dtg.h", + "kernels/element_binary_per_device_state.dtg.h", + "kernels/element_unary_per_device_state.dtg.h", + "kernels/gather_per_device_state.dtg.h", + "kernels/layer_norm_per_device_state.dtg.h", + "kernels/linear_per_device_state.dtg.h", + "kernels/partition_per_device_state.dtg.h", + "kernels/pool_2d_per_device_state.dtg.h", + "kernels/reduce_per_device_state.dtg.h", + "kernels/softmax_per_device_state.dtg.h", + "", ] [[values]] -type = "::FlexFlow::MHAPerDeviceState" +type = "std::optional<::FlexFlow::MHAPerDeviceState>" key = "mha_per_device_state" [[values]] -type = "::FlexFlow::BatchNormPerDeviceState" +type = "std::optional<::FlexFlow::BatchNormPerDeviceState>" key = "batch_norm_per_device_state" [[values]] -type = "::FlexFlow::Conv2DPerDeviceState" +type = "std::optional<::FlexFlow::Conv2DPerDeviceState>" key = "conv2d_per_device_state" [[values]] -type = "::FlexFlow::DropoutPerDeviceState" +type = "std::optional<::FlexFlow::DropoutPerDeviceState>" key = "dropout_per_device_state" [[values]] -type = "::FlexFlow::ElementBinaryPerDeviceState" +type = "std::optional<::FlexFlow::ElementBinaryPerDeviceState>" key = "element_binary_per_device_state" [[values]] -type = "::FlexFlow::ElementUnaryPerDeviceState" +type = "std::optional<::FlexFlow::ElementUnaryPerDeviceState>" key = "element_unary_per_device_state" [[values]] -type = "::FlexFlow::GatherPerDeviceState" +type = "std::optional<::FlexFlow::GatherPerDeviceState>" key = "gather_per_device_state" [[values]] -type = "::FlexFlow::LayerNormPerDeviceState" +type = "std::optional<::FlexFlow::LayerNormPerDeviceState>" key = "layer_norm_per_device_state" [[values]] -type = "::FlexFlow::LinearPerDeviceState" +type = "std::optional<::FlexFlow::LinearPerDeviceState>" key = "linear_per_device_state" [[values]] -type = "::FlexFlow::Pool2DPerDeviceState" +type = "std::optional<::FlexFlow::Pool2DPerDeviceState>" key = "pool_2d_per_device_state" [[values]] -type = "::FlexFlow::ReducePerDeviceState" +type = "std::optional<::FlexFlow::ReducePerDeviceState>" key = "reduce_per_device_state" [[values]] -type = "::FlexFlow::RepartitionPerDeviceState" +type = "std::optional<::FlexFlow::RepartitionPerDeviceState>" key = "repartition_per_device_state" [[values]] -type = "::FlexFlow::ReshapePerDeviceState" -key = "reshape_per_device_state" - -[[values]] -type = "::FlexFlow::SoftmaxPerDeviceState" +type = "std::optional<::FlexFlow::SoftmaxPerDeviceState>" key = "softmax_per_device_state" - -[[values]] -type = "::FlexFlow::TopKPerDeviceState" -key = "topk_per_device_state" diff --git a/lib/task-spec/include/task-spec/profiling.h b/lib/task-spec/include/task-spec/profiling.h index bd50801fc4..91774f69ef 100644 --- a/lib/task-spec/include/task-spec/profiling.h +++ b/lib/task-spec/include/task-spec/profiling.h @@ -9,10 +9,13 @@ namespace FlexFlow { enum class EnableProfiling { YES, NO }; template -std::optional - profile(F const &f, ProfilingSettings profiling, Str s, Ts &&...ts) { - std::optional elapsed = - profiling_wrapper(f, profiling, std::forward(ts)...); +std::optional profile(F const &f, + ProfilingSettings profiling, + DeviceType device_type, + Str s, + Ts &&...ts) { + std::optional elapsed = profiling_wrapper( + f, profiling, device_type, std::forward(ts)...); if (elapsed.has_value()) { spdlog::debug(s, elapsed.value()); } diff --git a/lib/task-spec/include/task-spec/runtime_arg_config.h 
b/lib/task-spec/include/task-spec/runtime_arg_config.h index f4320bc40b..5358caf331 100644 --- a/lib/task-spec/include/task-spec/runtime_arg_config.h +++ b/lib/task-spec/include/task-spec/runtime_arg_config.h @@ -1,18 +1,17 @@ -#ifndef _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H -#define _FLEXFLOW_LOCAL_EXECUTION_RUNTIME_ARG_CONFIG_H +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_RUNTIME_ARG_CONFIG_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_RUNTIME_ARG_CONFIG_H -#include "kernels/ff_handle.h" -#include "task-spec/device_specific.h" -#include "task-spec/profiling.h" +#include "task-spec/runtime_arg_config.dtg.h" namespace FlexFlow { -struct RuntimeArgConfig { -public: - DeviceSpecific ff_handle; - EnableProfiling enable_profiling; - ProfilingSettings profiling_settings; -}; +RuntimeArgConfig + cpu_make_runtime_arg_config(EnableProfiling enable_profiling, + ProfilingSettings profiling_settings); +RuntimeArgConfig + gpu_make_runtime_arg_config(PerDeviceFFHandle const &ff_handle, + EnableProfiling enable_profiling, + ProfilingSettings profiling_settings); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml b/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml new file mode 100644 index 0000000000..9d77616306 --- /dev/null +++ b/lib/task-spec/include/task-spec/runtime_arg_config.struct.toml @@ -0,0 +1,25 @@ +namespace = "FlexFlow" +name = "RuntimeArgConfig" +features = [] + +includes = [ + "kernels/device_handle_t.dtg.h", + "task-spec/device_specific.h", + "task-spec/profiling.h", +] + +[[fields]] +name = "ff_handle" +type = "::FlexFlow::DeviceSpecific<::FlexFlow::device_handle_t>" + +[[fields]] +name = "enable_profiling" +type = "::FlexFlow::EnableProfiling" + +[[fields]] +name = "profiling_settings" +type = "::FlexFlow::ProfilingSettings" + +[[fields]] +name = "kernel_device_type" +type = "::FlexFlow::DeviceType" diff --git a/lib/task-spec/include/task-spec/runtime_arg_ref.h b/lib/task-spec/include/task-spec/runtime_arg_ref.h index 33fccb0106..532482f89e 100644 --- a/lib/task-spec/include/task-spec/runtime_arg_ref.h +++ b/lib/task-spec/include/task-spec/runtime_arg_ref.h @@ -1,31 +1,25 @@ #ifndef _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H #define _FLEXFLOW_RUNTIME_SRC_TASK_SPEC_RUNTIME_ARG_REF_H +#include "kernels/device_handle_t.dtg.h" +#include "kernels/profiling_settings.dtg.h" +#include "pcg/device_type.dtg.h" #include "task-spec/arg_ref.h" #include "task-spec/config.h" #include "task-spec/device_specific.h" -#include "task-spec/profiling.h" -#include "utils/fmt.h" -#include "utils/type_index.h" +#include "task-spec/runtime_arg_ref_type.dtg.h" namespace FlexFlow { -enum class RuntimeArgRefType { - FF_HANDLE, - PROFILING_SETTINGS, - FF_ITERATION_CONFIG -}; - -std::string to_string(RuntimeArgRefType const &); - template using RuntimeArgRef = ArgRef; using RuntimeArgRefSpec = ArgRefSpec; RuntimeArgRef profiling_settings(); -RuntimeArgRef> ff_handle(); +RuntimeArgRef> ff_handle(); RuntimeArgRef iteration_config(); +RuntimeArgRef kernel_device_type(); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml b/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml new file mode 100644 index 0000000000..e33eeebc56 --- /dev/null +++ b/lib/task-spec/include/task-spec/runtime_arg_ref_type.enum.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "RuntimeArgRefType" +features = [ + "fmt", +] + +[[values]] +name = "FF_HANDLE" + +[[values]] +name = "PROFILING_SETTINGS" 
+ +[[values]] +name = "FF_ITERATION_CONFIG" + +[[values]] +name = "KERNEL_DEVICE_TYPE" diff --git a/lib/task-spec/include/task-spec/task_arg_spec.variant.toml b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml index 0f81f93405..4829a50ff6 100644 --- a/lib/task-spec/include/task-spec/task_arg_spec.variant.toml +++ b/lib/task-spec/include/task-spec/task_arg_spec.variant.toml @@ -7,7 +7,7 @@ features = [ ] includes = [ - "task-spec/concrete_arg.h", + "task-spec/concrete_arg_spec.h", "task-spec/runtime_arg_ref.h" ] diff --git a/lib/task-spec/include/task-spec/task_argument_accessor.h b/lib/task-spec/include/task-spec/task_argument_accessor.h index c1c42e09a3..a6d71b6b70 100644 --- a/lib/task-spec/include/task-spec/task_argument_accessor.h +++ b/lib/task-spec/include/task-spec/task_argument_accessor.h @@ -11,19 +11,7 @@ struct TaskArgumentAccessor { // arguments template T const &get_argument(slot_id_t slot) const { - if constexpr (PerDeviceOpState::IsPartOfPerDeviceOpState_v) { - PerDeviceOpState device_states = - this->ptr->get_concrete_arg(slot).get(); - if (device_states.has()) { - return device_states.get(); - } else { - throw mk_runtime_error(fmt::format( - "Invalid access to PerDeviceOpState attempted, instead it holds: ", - device_states.index())); - } - } else { - return this->ptr->get_concrete_arg(slot).get(); - } + return this->ptr->get_concrete_arg(slot).get(); } template diff --git a/lib/task-spec/include/task-spec/task_binding.h b/lib/task-spec/include/task-spec/task_binding.h index a945fec1d7..4cc286e104 100644 --- a/lib/task-spec/include/task-spec/task_binding.h +++ b/lib/task-spec/include/task-spec/task_binding.h @@ -1,32 +1,36 @@ #ifndef _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H #define _FLEXFLOW_LOCAL_EXECUTION_TASK_BINDING_H -#include "task-spec/loss_tensor_t.dtg.h" -#include "task-spec/lowered_tensor_t.dtg.h" -#include "task-spec/optimizer_tensor_t.dtg.h" +#include "task-spec/loss_tensor_guid_t.dtg.h" +#include "task-spec/optimizer_tensor_guid_t.dtg.h" #include "task-spec/slot_id_t.dtg.h" -#include "task-spec/slot_tensor_type_id.dtg.h" #include "task-spec/task_arg_spec.dtg.h" #include "task-spec/task_id_t.dtg.h" #include "task-spec/task_signature.dtg.h" -#include "task-spec/tensor_type_t.dtg.h" +#include "task-spec/tensor_sub_slot_id_t.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" namespace FlexFlow { struct TaskBinding { - TaskBinding() = default; + TaskBinding(); - void bind(int, tensor_guid_t const &); - void bind(slot_id_t, tensor_guid_t const &); + explicit TaskBinding( + std::unordered_map const + &tensor_bindings, + std::unordered_map const &arg_bindings); - void bind_grad(int, gradient_tensor_t const &); - void bind_grad(slot_id_t, gradient_tensor_t const &); + void bind(int, forward_tensor_guid_t const &); + void bind(slot_id_t, forward_tensor_guid_t const &); - void bind_optimizer(int, optimizer_tensor_t const &); - void bind_optimizer(slot_id_t, optimizer_tensor_t const &); + void bind_grad(int, gradient_tensor_guid_t const &); + void bind_grad(slot_id_t, gradient_tensor_guid_t const &); - void bind_loss(int, loss_tensor_t const &); - void bind_loss(slot_id_t, loss_tensor_t const &); + void bind_optimizer(int, optimizer_tensor_guid_t const &); + void bind_optimizer(slot_id_t, optimizer_tensor_guid_t const &); + + void bind_loss(int, loss_tensor_guid_t const &); + void bind_loss(slot_id_t, loss_tensor_guid_t const &); template void bind_arg(int name, T const &t) { @@ -51,18 +55,21 @@ struct TaskBinding { bool operator==(TaskBinding const 
&other) const; bool operator!=(TaskBinding const &other) const; - std::unordered_map const & + std::unordered_map const & get_tensor_bindings() const; std::unordered_map const &get_arg_bindings() const; void insert_arg_spec(slot_id_t name, TaskArgSpec const &arg_spec); private: - std::unordered_map tensor_bindings; + std::unordered_map + tensor_bindings; std::unordered_map arg_bindings; private: std::tuple tie() const; + + friend ::std::hash; }; std::string format_as(TaskBinding const &x); diff --git a/lib/task-spec/include/task-spec/task_id_t.enum.toml b/lib/task-spec/include/task-spec/task_id_t.enum.toml index b0c82b5d26..2e8f0a0046 100644 --- a/lib/task-spec/include/task-spec/task_id_t.enum.toml +++ b/lib/task-spec/include/task-spec/task_id_t.enum.toml @@ -106,9 +106,6 @@ name = "BATCHNORM_FWD_TASK_ID" [[values]] name = "BATCHNORM_BWD_TASK_ID" -[[values]] -name = "BATCHMATMUL_INIT_TASK_ID" - [[values]] name = "BATCHMATMUL_FWD_TASK_ID" @@ -178,9 +175,6 @@ name = "REDUCE_FWD_TASK_ID" [[values]] name = "REDUCE_BWD_TASK_ID" -[[values]] -name = "RESHAPE_INIT_TASK_ID" - [[values]] name = "RESHAPE_FWD_TASK_ID" @@ -196,9 +190,6 @@ name = "REVERSE_FWD_TASK_ID" [[values]] name = "REVERSE_BWD_TASK_ID" -[[values]] -name = "TOPK_INIT_TASK_ID" - [[values]] name = "TOPK_FWD_TASK_ID" diff --git a/lib/task-spec/include/task-spec/task_signature_impl.h b/lib/task-spec/include/task-spec/task_signature_impl.h index fcf9b346cf..a781e53485 100644 --- a/lib/task-spec/include/task-spec/task_signature_impl.h +++ b/lib/task-spec/include/task-spec/task_signature_impl.h @@ -8,12 +8,14 @@ namespace FlexFlow { -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &); +TaskSignatureAndImpl get_task_signature_and_impl_for_task_id(task_id_t const &); std::vector get_task_ids(ComputationGraphOpAttrs const &); -OpTaskInvocation init(ComputationGraphOpAttrs const &); -OpTaskInvocation forward(ComputationGraphOpAttrs const &); -OpTaskInvocation backward(ComputationGraphOpAttrs const &); +OpTaskInvocation get_init_op_task_invocation(ComputationGraphOpAttrs const &); +OpTaskInvocation + get_forward_op_task_invocation(ComputationGraphOpAttrs const &); +OpTaskInvocation + get_backward_op_task_invocation(ComputationGraphOpAttrs const &); } // namespace FlexFlow diff --git a/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml b/lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml similarity index 90% rename from lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml rename to lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml index ab5b981637..a830725a27 100644 --- a/lib/task-spec/include/task-spec/slot_tensor_type_id.struct.toml +++ b/lib/task-spec/include/task-spec/tensor_sub_slot_id_t.struct.toml @@ -1,5 +1,5 @@ namespace = "FlexFlow" -name = "SlotTensorTypeId" +name = "tensor_sub_slot_id_t" features = [ "eq", "ord", diff --git a/lib/task-spec/include/task-spec/tensor_type_t.variant.toml b/lib/task-spec/include/task-spec/tensor_type_t.variant.toml deleted file mode 100644 index b93ed91081..0000000000 --- a/lib/task-spec/include/task-spec/tensor_type_t.variant.toml +++ /dev/null @@ -1,31 +0,0 @@ -namespace = "FlexFlow" -name = "TensorTypeVariant" -features = [ - "eq", - "ord", - "hash", - "fmt", -] - -includes = [ - "pcg/tensor_guid_t.dtg.h", - "task-spec/optimizer_tensor_t.dtg.h", - "task-spec/gradient_tensor_t.dtg.h", - "task-spec/loss_tensor_t.dtg.h" -] - -[[values]] -type = "::FlexFlow::tensor_guid_t" -key = "tensor_guid" - -[[values]] -type = "::FlexFlow::gradient_tensor_t" 
-key = "gradient_tensor" - -[[values]] -type = "::FlexFlow::optimizer_tensor_t" -key = "optimizer_tensor" - -[[values]] -type = "::FlexFlow::loss_tensor_t" -key = "loss_tensor" diff --git a/lib/task-spec/include/task-spec/training_computation_graph.h b/lib/task-spec/include/task-spec/training_computation_graph.h new file mode 100644 index 0000000000..1cda57a49e --- /dev/null +++ b/lib/task-spec/include/task-spec/training_computation_graph.h @@ -0,0 +1,68 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_COMPUTATION_GRAPH_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_COMPUTATION_GRAPH_H + +#include "pcg/optimizer_attrs.dtg.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_computation_graph.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +TrainingComputationGraph generate_training_computation_graph( + ComputationGraph const &computation_graph, + OptimizerAttrs const &optimizer_attrs, + tensor_guid_t const &logit_tensor, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + LossTensorSource &loss_tensor_source); + +TrainingTensorGroup + get_training_tensor_group_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_tensor_guid( + TrainingComputationGraph const &, tensor_guid_t); + +forward_tensor_guid_t + get_forward_tensor_guid_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +gradient_tensor_guid_t + get_gradient_tensor_guid_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); +std::vector + get_optimizer_tensor_guids_for_tensor_guid(TrainingComputationGraph const &, + tensor_guid_t); + +tensor_guid_t + get_tensor_guid_for_forward_tensor_guid(TrainingComputationGraph const &, + forward_tensor_guid_t); +tensor_guid_t + get_tensor_guid_for_gradient_tensor_guid(TrainingComputationGraph const &, + gradient_tensor_guid_t); +tensor_guid_t + get_tensor_guid_for_optimizer_tensor_guid(TrainingComputationGraph const &, + optimizer_tensor_guid_t); + +tensor_guid_t + get_tensor_guid_for_training_tensor_guid(TrainingComputationGraph const &, + training_tensor_guid_t); + +std::unordered_set + get_all_training_tensors_in_training_computation_graph( + TrainingComputationGraph const &); + +TrainingLayerPlusContext + get_training_layer_plus_context(TrainingComputationGraph const &, + layer_guid_t); + +std::unordered_map + get_all_training_tensor_shapes(TrainingComputationGraph const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_computation_graph.struct.toml b/lib/task-spec/include/task-spec/training_computation_graph.struct.toml new file mode 100644 index 0000000000..1e294df7eb --- /dev/null +++ b/lib/task-spec/include/task-spec/training_computation_graph.struct.toml @@ -0,0 +1,27 @@ +namespace = "FlexFlow" +name = "TrainingComputationGraph" +features = [] + +includes = [ + "pcg/computation_graph.h", + "", + "pcg/tensor_guid_t.dtg.h", + "task-spec/training_tensor_group.dtg.h", + "task-spec/loss_tensor_guid_t.dtg.h", +] + +[[fields]] +name = "computation_graph" +type = "::FlexFlow::ComputationGraph" + +[[fields]] +name = "training_tensor_group_for_tensor" +type = 
"std::unordered_map" + +[[fields]] +name = "logit_tensor" +type = "::FlexFlow::tensor_guid_t" + +[[fields]] +name = "label_tensor" +type = "::FlexFlow::loss_tensor_guid_t" diff --git a/lib/task-spec/include/task-spec/training_layer_plus_context.h b/lib/task-spec/include/task-spec/training_layer_plus_context.h new file mode 100644 index 0000000000..4ce1ddf1a9 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_plus_context.h @@ -0,0 +1,50 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_PLUS_CONTEXT_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_PLUS_CONTEXT_H + +#include "pcg/cg_operator_tensor_shape_signature.dtg.h" +#include "pcg/tensor_role.dtg.h" +#include "task-spec/training_layer_plus_context.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" + +namespace FlexFlow { + +std::vector + get_training_tensor_groups_with_attrs_for_role( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role); + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_role_and_index( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role, + nonnegative_int index); + +std::vector + get_input_tensors(TrainingLayerPlusContext const &); +std::vector + get_input_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_input_tensor_shapes(TrainingLayerPlusContext const &); + +std::vector + get_weight_tensors(TrainingLayerPlusContext const &); +std::vector + get_weight_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_weight_tensor_shapes(TrainingLayerPlusContext const &); + +std::vector + get_output_tensors(TrainingLayerPlusContext const &); +std::vector + get_output_grad_tensors(TrainingLayerPlusContext const &); +std::vector + get_output_tensor_shapes(TrainingLayerPlusContext const &); + +TrainingLayerTensorGroupSignature + get_tensor_group_signature(TrainingLayerPlusContext const &); +CGOperatorTensorShapeSignature + get_cg_op_shape_signature(TrainingLayerPlusContext const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml b/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml new file mode 100644 index 0000000000..9090059351 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_plus_context.struct.toml @@ -0,0 +1,29 @@ +namespace = "FlexFlow" +name = "TrainingLayerPlusContext" +features = [] + +includes = [ + "pcg/layer_guid_t.dtg.h", + "pcg/layer_attrs.dtg.h", + "task-spec/training_tensor_group_with_attrs.dtg.h", +] + +[[fields]] +name = "layer_guid" +type = "::FlexFlow::layer_guid_t" + +[[fields]] +name = "layer_attrs" +type = "::FlexFlow::LayerAttrs" + +[[fields]] +name = "input_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" + +[[fields]] +name = "weight_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" + +[[fields]] +name = "output_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroupWithAttrs>" diff --git a/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h new file mode 100644 index 0000000000..62b11e3af3 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.h @@ -0,0 +1,20 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_TENSOR_GROUP_SIGNATURE_H +#define 
_FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_LAYER_TENSOR_GROUP_SIGNATURE_H + +#include "pcg/tensor_role.dtg.h" +#include "task-spec/training_layer_tensor_group_signature.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +std::vector get_training_tensor_groups_for_role( + TrainingLayerTensorGroupSignature const &signature, TensorRole tensor_role); + +TrainingTensorGroup get_training_tensor_group_for_role_and_index( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role, + nonnegative_int index); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml new file mode 100644 index 0000000000..d9859559a1 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_layer_tensor_group_signature.struct.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "TrainingLayerTensorGroupSignature" +features = [] + +includes = [ + "task-spec/training_tensor_group.dtg.h", +] + +[[fields]] +name = "input_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" + +[[fields]] +name = "weight_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" + +[[fields]] +name = "output_tensor_groups" +type = "std::vector<::FlexFlow::TrainingTensorGroup>" diff --git a/lib/task-spec/include/task-spec/training_tensor_group.h b/lib/task-spec/include/task-spec/training_tensor_group.h new file mode 100644 index 0000000000..40269ceab0 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group.h @@ -0,0 +1,28 @@ +#ifndef _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TRAINING_TENSOR_GROUP_H +#define _FLEXFLOW_LIB_LOCAL_EXECUTION_INCLUDE_LOCAL_EXECUTION_TRAINING_TENSOR_GROUP_H + +#include "pcg/optimizer_attrs.dtg.h" +#include "pcg/tensor_attrs.dtg.h" +#include "pcg/tensor_guid_t.dtg.h" +#include "task-spec/forward_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" +#include "task-spec/training_tensor_group.dtg.h" +#include "task-spec/training_tensor_guid_t.dtg.h" + +namespace FlexFlow { + +TrainingTensorGroup make_training_tensor_group_for_tensor_guid_t( + tensor_guid_t tensor_guid, + TensorAttrs const &tensor_attrs, + OptimizerAttrs const &optimizer_attrs, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source); + +std::unordered_set + get_all_training_tensors_in_tensor_group(TrainingTensorGroup const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_tensor_group.struct.toml b/lib/task-spec/include/task-spec/training_tensor_group.struct.toml new file mode 100644 index 0000000000..eadaac08ad --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group.struct.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "TrainingTensorGroup" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "forward_tensor" +type = "::FlexFlow::forward_tensor_guid_t" + +[[fields]] +name = "gradient_tensor" +type = "::FlexFlow::gradient_tensor_guid_t" + +[[fields]] +name = "optimizer_tensors" +type = 
"std::vector<::FlexFlow::optimizer_tensor_guid_t>" diff --git a/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h new file mode 100644 index 0000000000..2560228b1c --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_TENSOR_GROUP_WITH_ATTRS_H +#define _FLEXFLOW_LIB_TASK_SPEC_INCLUDE_TASK_SPEC_TRAINING_TENSOR_GROUP_WITH_ATTRS_H + +#include "task-spec/training_tensor_group.dtg.h" +#include "task-spec/training_tensor_group_with_attrs.dtg.h" + +namespace FlexFlow { + +TrainingTensorGroupWithAttrs + make_training_tensor_group_with_attrs_from_group_and_attrs( + TrainingTensorGroup const &group, TensorAttrs const &attrs); + +TrainingTensorGroup + tensor_group_without_attrs(TrainingTensorGroupWithAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml new file mode 100644 index 0000000000..5816214fb3 --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_group_with_attrs.struct.toml @@ -0,0 +1,37 @@ +namespace = "FlexFlow" +name = "TrainingTensorGroupWithAttrs" +features = [ + "eq", + "ord", + "fmt", + "hash", +] + +includes = [ + "pcg/tensor_attrs.dtg.h", + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", +] + +src_includes = [ + "utils/hash/vector.h", + "utils/fmt/vector.h", +] + +[[fields]] +name = "tensor_attrs" +type = "::FlexFlow::TensorAttrs" + +[[fields]] +name = "forward_tensor" +type = "::FlexFlow::forward_tensor_guid_t" + +[[fields]] +name = "gradient_tensor" +type = "::FlexFlow::gradient_tensor_guid_t" + +[[fields]] +name = "optimizer_tensors" +type = "std::vector<::FlexFlow::optimizer_tensor_guid_t>" + diff --git a/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml b/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml new file mode 100644 index 0000000000..d2520dacbf --- /dev/null +++ b/lib/task-spec/include/task-spec/training_tensor_guid_t.variant.toml @@ -0,0 +1,31 @@ +namespace = "FlexFlow" +name = "training_tensor_guid_t" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "task-spec/forward_tensor_guid_t.dtg.h", + "task-spec/optimizer_tensor_guid_t.dtg.h", + "task-spec/gradient_tensor_guid_t.dtg.h", + "task-spec/loss_tensor_guid_t.dtg.h" +] + +[[values]] +type = "::FlexFlow::forward_tensor_guid_t" +key = "forward_tensor" + +[[values]] +type = "::FlexFlow::gradient_tensor_guid_t" +key = "gradient_tensor" + +[[values]] +type = "::FlexFlow::optimizer_tensor_guid_t" +key = "optimizer_tensor" + +[[values]] +type = "::FlexFlow::loss_tensor_guid_t" +key = "loss_tensor" diff --git a/lib/task-spec/src/task-spec/concrete_arg.cc b/lib/task-spec/src/task-spec/concrete_arg_spec.cc similarity index 94% rename from lib/task-spec/src/task-spec/concrete_arg.cc rename to lib/task-spec/src/task-spec/concrete_arg_spec.cc index b67b74b19a..05fd703df1 100644 --- a/lib/task-spec/src/task-spec/concrete_arg.cc +++ b/lib/task-spec/src/task-spec/concrete_arg_spec.cc @@ -1,4 +1,4 @@ -#include "task-spec/concrete_arg.h" +#include "task-spec/concrete_arg_spec.h" namespace FlexFlow { diff --git a/lib/task-spec/src/task-spec/forward_tensor_source.cc b/lib/task-spec/src/task-spec/forward_tensor_source.cc new file 
mode 100644 index 0000000000..3d82452377 --- /dev/null +++ b/lib/task-spec/src/task-spec/forward_tensor_source.cc @@ -0,0 +1,18 @@ +#include "task-spec/forward_tensor_source.h" + +namespace FlexFlow { + +int ForwardTensorSource::next_available_forward_tensor_id = 0; + +ForwardTensorSource::ForwardTensorSource() {} + +forward_tensor_guid_t ForwardTensorSource::new_forward_tensor() { + return forward_tensor_guid_t{ + ForwardTensorSource::next_available_forward_tensor_id++}; +} + +void ForwardTensorSource::reset() { + ForwardTensorSource::next_available_forward_tensor_id = 0; +} + +} // namespace FlexFlow diff --git a/lib/local-execution/src/gradient_tensor_source.cc b/lib/task-spec/src/task-spec/gradient_tensor_source.cc similarity index 55% rename from lib/local-execution/src/gradient_tensor_source.cc rename to lib/task-spec/src/task-spec/gradient_tensor_source.cc index 7dcb947e89..8bc5034634 100644 --- a/lib/local-execution/src/gradient_tensor_source.cc +++ b/lib/task-spec/src/task-spec/gradient_tensor_source.cc @@ -1,13 +1,13 @@ -#include "local-execution/gradient_tensor_source.h" +#include "task-spec/gradient_tensor_source.h" namespace FlexFlow { -size_t GradientTensorSource::next_available_gradient_tensor_id = 0; +int GradientTensorSource::next_available_gradient_tensor_id = 0; GradientTensorSource::GradientTensorSource() {} -gradient_tensor_t GradientTensorSource::new_gradient_tensor() { - return gradient_tensor_t{ +gradient_tensor_guid_t GradientTensorSource::new_gradient_tensor() { + return gradient_tensor_guid_t{ GradientTensorSource::next_available_gradient_tensor_id++}; } diff --git a/lib/local-execution/src/loss_functions.cc b/lib/task-spec/src/task-spec/loss_functions.cc similarity index 66% rename from lib/local-execution/src/loss_functions.cc rename to lib/task-spec/src/task-spec/loss_functions.cc index 4d0b32fd48..698ca941d3 100644 --- a/lib/local-execution/src/loss_functions.cc +++ b/lib/task-spec/src/task-spec/loss_functions.cc @@ -15,14 +15,13 @@ #include "op-attrs/ops/loss_functions.h" #include "kernels/loss_function_kernels.h" -#include "local-execution/loss_functions.h" -#include "kernels/format_accessor_contents.h" +#include "task-spec/loss_functions.h" #include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING }; +enum Slots { LOGIT, LABEL, LOGIT_GRAD, ATTRS, PROFILING, KERNEL_DEVICE_TYPE }; TaskSignature get_loss_bwd_signature() { TaskSignature sig = make_empty_task_signature(); @@ -32,13 +31,14 @@ TaskSignature get_loss_bwd_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); + add_arg_slot(sig, KERNEL_DEVICE_TYPE); return sig; } TaskInvocation backward(LossAttrs const &attrs, - tensor_guid_t logit, - gradient_tensor_t logit_grad, - loss_tensor_t label) { + forward_tensor_guid_t logit, + gradient_tensor_guid_t logit_grad, + loss_tensor_guid_t label) { TaskBinding b; b.bind(LOGIT, logit); b.bind_loss(LABEL, label); @@ -46,6 +46,7 @@ TaskInvocation backward(LossAttrs const &attrs, b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); return TaskInvocation{task_id_t::LOSS_BWD_TASK_ID, b}; } @@ -53,53 +54,63 @@ TaskInvocation backward(LossAttrs const &attrs, static void backward_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); auto profiling = acc.get_argument(PROFILING); + auto kernel_device_type = acc.get_argument(KERNEL_DEVICE_TYPE); auto 
logit_grad = acc.get_tensor_grad(LOGIT_GRAD); auto logit = acc.get_tensor(LOGIT); auto label = acc.get_loss_tensor(LABEL); - int batch_size = logit.shape.at(legion_dim_t{1_n}).int_from_positive_int(); + int batch_size = + dim_at_idx(logit.shape.dims, legion_dim_t{1_n}).int_from_positive_int(); // assuming logit shape is [batch dim, num classes] LossFunction loss_type = get_loss_function(attrs); float scale_factor = 1.0f / batch_size; if (loss_type == LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE) { - ASSERT(logit.shape.num_elements() == label.shape.num_elements()); - scale_factor = 2.0f / logit.shape.num_elements().int_from_positive_int(); + ASSERT(get_num_elements(logit.shape.dims) == + get_num_elements(label.shape.dims)); + scale_factor = + 2.0f / get_num_elements(logit.shape.dims).int_from_positive_int(); } if (loss_type == LossFunction::SPARSE_CATEGORICAL_CROSSENTROPY) { // label shape is [batch dim, 1] auto scce_attrs = attrs.get(); - size_t ndim = logit.shape.num_dims().unwrap_nonnegative(); - int num_classes = logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + size_t ndim = get_num_dims(logit.shape.dims).unwrap_nonnegative(); + int num_classes = + dim_at_idx(logit.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); ASSERT(logit_grad.shape == logit.shape); int k = 1; if (scce_attrs.replace_labels) { - k = logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + k = dim_at_idx(logit.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int() / - label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + dim_at_idx(label.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int(); // TODO FIXME something seems wrong // here, isn't the numerator guaranteed // to be 1? // <--- this is not the case because of // the potential parallel dim } - ASSERT(label.shape.sub_shape(legion_dim_t(1_n), std::nullopt) == - logit.shape.sub_shape(legion_dim_t(1_n), std::nullopt)); - ASSERT(k * label.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + ASSERT(slice_tensor_dims( + label.shape.dims, relative_ff_dim_t{0}, relative_ff_dim_t{-2}) == + slice_tensor_dims( + logit.shape.dims, relative_ff_dim_t{0}, relative_ff_dim_t{-2})); + ASSERT(k * dim_at_idx(label.shape.dims, + legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int() == - logit.shape.at(legion_dim_t(nonnegative_int{ndim - 1})) + dim_at_idx(logit.shape.dims, legion_dim_t{nonnegative_int{ndim - 1}}) .int_from_positive_int()); - ASSERT(label.shape.at(legion_dim_t(0_n)).int_from_positive_int() == 1); + ASSERT(dim_at_idx(label.shape.dims, legion_dim_t(0_n)) + .int_from_positive_int() == 1); profile(sparse_categorical_crossentropy_loss_backward_kernel, profiling, + kernel_device_type, "[SparseCategoricalCrossEntropyLoss] backward_time = %.2lfms\n", get_float_ptr(logit_grad), get_float_ptr(logit), reinterpret_cast(get_float_ptr(label)), - get_num_elements(logit.shape).int_from_positive_int(), - get_num_elements(logit_grad.shape).int_from_positive_int(), + get_num_elements(logit.shape.dims).int_from_positive_int(), + get_num_elements(logit_grad.shape.dims).int_from_positive_int(), batch_size, num_classes, k, @@ -108,46 +119,41 @@ static void backward_task_impl(TaskArgumentAccessor const &acc) { ASSERT(logit.shape == label.shape); ASSERT(logit_grad.shape == logit.shape); int num_channels = - logit.shape.at(legion_dim_t{0_n}).int_from_positive_int(); + dim_at_idx(logit.shape.dims, legion_dim_t{0_n}).int_from_positive_int(); switch (loss_type) { case LossFunction::CATEGORICAL_CROSSENTROPY: { - 
size_t logit_volume = get_num_elements(logit.shape).int_from_positive_int();
-      size_t logit_grad_volume =
-          get_num_elements(logit_grad.shape).int_from_positive_int();
-
       profile(categorical_crossentropy_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[CategoricalCrossEntropyLoss] backward_time = %.2lfms\n",
-              get_float_ptr(logit_grad),
-              get_float_ptr(logit),
-              get_float_ptr(label),
-              logit_volume,
-              logit_grad_volume,
+              logit_grad,
+              logit,
+              label,
               scale_factor);
-
       break;
     }
     case LossFunction::MEAN_SQUARED_ERROR_AVG_REDUCE: {
       profile(mean_squared_error_avg_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[MeanSquaredErrorAvgLoss] backward_time = %.2lfms\n",
               get_float_ptr(logit_grad),
               get_float_ptr(logit),
               get_float_ptr(label),
-              get_num_elements(logit.shape).int_from_positive_int(),
-              get_num_elements(logit_grad.shape).int_from_positive_int(),
+              get_num_elements(logit.shape.dims).int_from_positive_int(),
+              get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
               scale_factor);
       break;
     }
     case LossFunction::IDENTITY: {
       profile(identity_loss_backward_kernel,
               profiling,
+              kernel_device_type,
               "[IdentityLoss] backward_time = %.2lfms\n",
               get_float_ptr(logit_grad),
               get_float_ptr(logit),
-              get_num_elements(logit.shape).int_from_positive_int(),
-              get_num_elements(logit_grad.shape).int_from_positive_int(),
+              get_num_elements(logit.shape.dims).int_from_positive_int(),
+              get_num_elements(logit_grad.shape.dims).int_from_positive_int(),
               scale_factor);
       break;
     }
diff --git a/lib/task-spec/src/task-spec/loss_tensor_source.cc b/lib/task-spec/src/task-spec/loss_tensor_source.cc
new file mode 100644
index 0000000000..13b97fd604
--- /dev/null
+++ b/lib/task-spec/src/task-spec/loss_tensor_source.cc
@@ -0,0 +1,13 @@
+#include "task-spec/loss_tensor_source.h"
+
+namespace FlexFlow {
+
+nonnegative_int LossTensorSource::next_available_loss_tensor_id = 0_n;
+
+LossTensorSource::LossTensorSource() {}
+
+loss_tensor_guid_t LossTensorSource::new_loss_tensor() {
+  return loss_tensor_guid_t{LossTensorSource::next_available_loss_tensor_id++};
+}
+
+} // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/op_arg_ref.cc b/lib/task-spec/src/task-spec/op_arg_ref.cc
index a427117982..29c895f1c8 100644
--- a/lib/task-spec/src/task-spec/op_arg_ref.cc
+++ b/lib/task-spec/src/task-spec/op_arg_ref.cc
@@ -2,8 +2,31 @@
 
 namespace FlexFlow {
 
-OpArgRef input_parallel_tensor_shape(int idx) {
-  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{idx}};
+OpArgRef input_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::INPUT,
+      /*idx=*/idx,
+  }};
+  ArgRef arg_ref = {arg_ref_type};
+  return arg_ref;
+}
+
+OpArgRef
+    weight_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::WEIGHT,
+      /*idx=*/idx,
+  }};
+  ArgRef arg_ref = {arg_ref_type};
+  return arg_ref;
+}
+
+OpArgRef
+    output_parallel_tensor_shape(nonnegative_int idx) {
+  OpArgRefType arg_ref_type = OpArgRefType{ParallelTensorShapeRefType{
+      /*tensor_role=*/TensorRole::OUTPUT,
+      /*idx=*/idx,
+  }};
   ArgRef arg_ref = {arg_ref_type};
   return arg_ref;
 }
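As a quick check on the scale factors computed in the loss backward task above (an illustrative note, not part of the patch): for a logit tensor of shape [batch_size, num_classes], every loss except MEAN_SQUARED_ERROR_AVG_REDUCE scales the gradient by 1/batch_size, while MSE-avg scales by 2/num_elements:

    // batch_size = 64, num_classes = 10  =>  logit shape [64, 10]
    float scale_default = 1.0f / 64;         // 1 / batch_size   = 0.015625
    float scale_mse_avg = 2.0f / (64 * 10);  // 2 / num_elements = 0.003125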
diff --git a/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
index 515d1dc1dc..b33edc9a76 100644
--- a/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
+++ b/lib/task-spec/src/task-spec/op_task_to_task_invocation.cc
@@ -1,105 +1,161 @@
 #include "task-spec/op_task_to_task_invocation.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "pcg/cg_operator_tensor_shape_signature.h"
 #include "pcg/computation_graph.h"
+#include "task-spec/slot_grad_id.dtg.h"
+#include "task-spec/training_layer_plus_context.h"
+#include "task-spec/training_layer_tensor_group_signature.h"
+#include "utils/containers/map_values.h"
+#include "utils/containers/transform.h"
+#include "utils/overload.h"
 
 namespace FlexFlow {
 
-TaskInvocation lower_to_task_invocation(
-    OpTaskInvocation const &op_task_invocation,
-    layer_guid_t const &layer_guid,
-    std::vector const &input_tensors,
-    std::vector const &input_tensor_shapes,
-    std::vector const &output_tensors,
-    std::vector const &weight_tensors,
-    std::unordered_map const
-        &tensor_gradient_mapping,
-    std::optional const &device_states) {
-  TaskBinding binding;
+TaskInvocation
+    lower_to_task_invocation(OpTaskInvocation const &op_task_invocation,
+                             TrainingLayerPlusContext const &training_layer,
+                             std::optional<DeviceSpecificDeviceStates> const
+                                 &device_specific_device_states) {
 
-  for (auto const &tensor_binding :
-       op_task_invocation.binding.get_tensor_bindings()) {
-    tensor_guid_t tensor_to_bind = [&] {
-      OpTensorSpec tensor_binding_spec = tensor_binding.second;
-      switch (tensor_binding_spec.role) {
-        case TensorRole::INPUT:
-          return input_tensors.at(tensor_binding_spec.idx);
-        case TensorRole::OUTPUT:
-          return output_tensors.at(tensor_binding_spec.idx);
-        case TensorRole::WEIGHT:
-          return weight_tensors.at(tensor_binding_spec.idx);
-        default:
-          throw mk_runtime_error(
-              fmt::format("Invalid tensor role {}", tensor_binding_spec.role));
-      }
-    }();
+  std::unordered_map<tensor_sub_slot_id_t, training_tensor_guid_t>
+      tensor_bindings =
+          transform(op_task_invocation.binding.get_tensor_bindings(),
+                    [&](SlotGradId const &slot_grad_id,
+                        OpTensorSpec const &op_tensor_spec) {
+                      return lower_tensor_binding(
+                          get_tensor_group_signature(training_layer),
+                          slot_grad_id,
+                          op_tensor_spec);
+                    });
 
-    SlotGradId slot_grad_id = tensor_binding.first;
+  std::unordered_map<slot_id_t, TaskArgSpec> arg_bindings = map_values(
+      op_task_invocation.binding.get_arg_bindings(),
+      [&](OpArgSpec const &op_arg_spec) {
+        return lower_to_task_arg_spec(op_arg_spec,
+                                      get_cg_op_shape_signature(training_layer),
+                                      training_layer.layer_guid,
+                                      device_specific_device_states);
+      });
 
-    if (slot_grad_id.is_grad == IsGrad::NO) {
-      binding.bind(slot_grad_id.slot_id, tensor_to_bind);
-    } else if (slot_grad_id.is_grad == IsGrad::YES) {
-      binding.bind_grad(slot_grad_id.slot_id,
-                        tensor_gradient_mapping.at(tensor_to_bind));
-    } else {
-      throw mk_runtime_error(fmt::format("Invalid value for IsGrad {}",
-                                         tensor_binding.first.is_grad));
-    }
-  }
+  return TaskInvocation{
+      op_task_invocation.task_id,
+      TaskBinding{
+          tensor_bindings,
+          arg_bindings,
+      },
+  };
+}
 
-  // args
-  for (auto const &arg_binding :
-       op_task_invocation.binding.get_arg_bindings()) {
-    if (arg_binding.second.has()) {
-      ConcreteArgSpec concrete_arg =
-          lower_to_concrete_arg_spec(arg_binding.second.get(),
-                                     input_tensor_shapes,
-                                     layer_guid,
-                                     device_states);
-      binding.insert_arg_spec(arg_binding.first, TaskArgSpec{concrete_arg});
-    } else if (arg_binding.second.has()) {
-      binding.insert_arg_spec(
-          arg_binding.first,
-          TaskArgSpec{arg_binding.second.get()});
-    } else {
-      binding.insert_arg_spec(
-          arg_binding.first,
-          TaskArgSpec{arg_binding.second.get()});
-    }
+std::pair<tensor_sub_slot_id_t, training_tensor_guid_t>
+    lower_tensor_binding(TrainingLayerTensorGroupSignature const &signature,
+                         SlotGradId const &slot_grad_id,
+                         OpTensorSpec const &op_tensor_spec) {
+  auto [tensor_to_bind,
gradient_tensor_guid_to_bind] = [&] { + TrainingTensorGroup group = get_training_tensor_group_for_role_and_index( + signature, op_tensor_spec.role, op_tensor_spec.idx); + + return std::pair{ + group.forward_tensor, + group.gradient_tensor, + }; + }(); + + if (slot_grad_id.is_grad == IsGrad::NO) { + return std::pair{ + tensor_sub_slot_id_t{ + slot_grad_id.slot_id, + TensorType::FORWARD, + }, + training_tensor_guid_t{ + tensor_to_bind, + }, + }; + } else if (slot_grad_id.is_grad == IsGrad::YES) { + return std::pair{ + tensor_sub_slot_id_t{ + slot_grad_id.slot_id, + TensorType::GRADIENT, + }, + training_tensor_guid_t{ + gradient_tensor_guid_to_bind, + }, + }; + } else { + PANIC("Invalid value for IsGrad {}", slot_grad_id.is_grad); } +} - return TaskInvocation{op_task_invocation.task_id, binding}; +TaskArgSpec lower_to_task_arg_spec( + OpArgSpec const &op_arg_spec, + CGOperatorTensorShapeSignature const &op_shape_signature, + layer_guid_t const &layer_guid, + std::optional const + &device_specific_device_states) { + return op_arg_spec.visit(overload{ + [](ConcreteArgSpec const &concrete_arg_spec) { + return TaskArgSpec{concrete_arg_spec}; + }, + [](RuntimeArgRefSpec const &runtime_arg_ref_spec) { + return TaskArgSpec{runtime_arg_ref_spec}; + }, + [&](OpArgRefSpec const &op_arg_ref_spec) { + return TaskArgSpec{ + lower_to_concrete_arg_spec(op_arg_ref_spec, + op_shape_signature, + layer_guid, + device_specific_device_states), + }; + }, + }); } ConcreteArgSpec lower_to_concrete_arg_spec( OpArgRefSpec const &op_arg_ref_spec, - std::vector const &input_tensor_shapes, + CGOperatorTensorShapeSignature const &op_signature, layer_guid_t const &op_guid, std::optional const &device_states) { - if (op_arg_ref_spec.holds()) { - PerDeviceOpState device_state = - get_device_state_from_device_specific(device_states.value(), 0); - return ConcreteArgSpec::create(device_state); - } else if (op_arg_ref_spec.holds()) { - ParallelTensorShapeRefType index_op_arg_ref = - op_arg_ref_spec.get_ref_type().get(); - TensorShape input_tensor_shape = - input_tensor_shapes.at(index_op_arg_ref.idx); - ParallelTensorShape shape = lift_to_parallel(input_tensor_shape); - return ConcreteArgSpec::create(shape); - } else { - throw mk_runtime_error("Unhandled op arg ref type"); - } + + OpArgRefType op_arg_ref_type = op_arg_ref_spec.get_ref_type(); + return op_arg_ref_type.visit(overload{ + [&](PerDeviceOpStateRefType const &) { + PerDeviceOpState per_device_op_state = + get_device_state_from_device_specific(device_states.value(), 0); + + return per_device_op_state.visit(overload{ + [&](auto const &x) { + ASSERT(matches(op_arg_ref_spec.get_type_index())); + return ConcreteArgSpec::create(x); + }, + }); + }, + [&](ParallelTensorShapeRefType const &ref_type) { + TensorShape tensor_shape = tensor_shape_for_role_and_index( + /*signature=*/op_signature, + /*tensor_role=*/ref_type.tensor_role, + /*index=*/ref_type.idx); + ParallelTensorShape shape = lift_to_parallel(tensor_shape); + return ConcreteArgSpec::create(shape); + }, + }); } ConcreteArgSpec lower_to_concrete_arg_spec(RuntimeArgRefSpec const &runtime_arg_ref_spec, RuntimeArgConfig const &runtime_arg_config) { - if (runtime_arg_ref_spec.holds>()) { - return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); - } else if (runtime_arg_ref_spec.holds()) { - return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); - } else { - throw mk_runtime_error("Unhandled runtime arg ref type"); + switch (runtime_arg_ref_spec.get_ref_type()) { + case 
RuntimeArgRefType::FF_HANDLE: + return ConcreteArgSpec::create(*(runtime_arg_config.ff_handle.get(0))); + case RuntimeArgRefType::PROFILING_SETTINGS: + return ConcreteArgSpec::create(runtime_arg_config.profiling_settings); + case RuntimeArgRefType::FF_ITERATION_CONFIG: + PANIC("FF_ITERATION_CONFIG is currently not handled. Please create an " + "issue or contact the FlexFlow train developers if you need this " + "feature."); + case RuntimeArgRefType::KERNEL_DEVICE_TYPE: + return ConcreteArgSpec::create(runtime_arg_config.kernel_device_type); + default: + PANIC(fmt::format("Unhandled RuntimeArgRefType {}", + runtime_arg_ref_spec.get_ref_type())); } } diff --git a/lib/task-spec/src/task-spec/op_tensor_spec.cc b/lib/task-spec/src/task-spec/op_tensor_spec.cc index 1d97e6ae16..ed312e47af 100644 --- a/lib/task-spec/src/task-spec/op_tensor_spec.cc +++ b/lib/task-spec/src/task-spec/op_tensor_spec.cc @@ -2,16 +2,16 @@ namespace FlexFlow { -OpTensorSpec input_tensor(int idx, OpSlotOptions option) { - return {TensorRole::INPUT, option, idx}; +OpTensorSpec input_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::INPUT, option, idx}; } -OpTensorSpec output_tensor(int idx, OpSlotOptions option) { - return {TensorRole::OUTPUT, option, idx}; +OpTensorSpec output_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::OUTPUT, option, idx}; } -OpTensorSpec weight_tensor(int idx, OpSlotOptions option) { - return {TensorRole::WEIGHT, option, idx}; +OpTensorSpec weight_tensor(nonnegative_int idx, OpSlotOptions option) { + return OpTensorSpec{TensorRole::WEIGHT, option, idx}; } } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/attention.cc b/lib/task-spec/src/task-spec/ops/attention.cc index 488517a02e..ea2282792a 100644 --- a/lib/task-spec/src/task-spec/ops/attention.cc +++ b/lib/task-spec/src/task-spec/ops/attention.cc @@ -15,9 +15,11 @@ #include "task-spec/ops/attention.h" #include "kernels/attention_kernels.h" +#include "kernels/device_handle_t.dtg.h" #include "op-attrs/ops/attention.h" #include "op-attrs/ops/attention/multihead_attention_parallel_inputs.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" namespace FlexFlow { @@ -39,7 +41,8 @@ enum Slots { WEIGHTS, OUTPUT, HANDLE, - PER_DEVICE_STATE + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { @@ -48,49 +51,66 @@ OpTaskInvocation init(MultiHeadAttentionAttrs const &attrs) { b.bind_arg(HANDLE, ff_handle()); b.bind_arg(ATTRS, attrs); - b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0)); - b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1)); - b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + b.bind_arg(QUERY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(0_n)); + b.bind_arg(KEY_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(1_n)); + b.bind_arg(VALUE_PARALLEL_TENSOR_SHAPE, input_parallel_tensor_shape(2_n)); b.bind_arg(QPROJSIZE, get_qProjSize(attrs)); b.bind_arg(KPROJSIZE, get_kProjSize(attrs)); b.bind_arg(VPROJSIZE, get_vProjSize(attrs)); b.bind_arg(OPROJSIZE, get_oProjSize(attrs)); - return {task_id_t::ATTENTION_INIT_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(MultiHeadAttentionAttrs const &attrs) { OpTaskBinding b; - b.bind(QUERY, input_tensor(0)); - b.bind(KEY, input_tensor(1)); - 
b.bind(VALUE, input_tensor(2)); - b.bind(WEIGHTS, weight_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); + b.bind(QUERY, input_tensor(0_n)); + b.bind(KEY, input_tensor(1_n)); + b.bind(VALUE, input_tensor(2_n)); + b.bind(WEIGHTS, weight_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); + b.bind_arg(PER_DEVICE_STATE, + per_device_op_state>()); - return {task_id_t::ATTENTION_FWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(MultiHeadAttentionAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ATTENTION_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ATTENTION_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); + + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + positive_int qProjSize = acc.get_argument(QPROJSIZE); positive_int kProjSize = acc.get_argument(KPROJSIZE); positive_int vProjSize = acc.get_argument(VPROJSIZE); positive_int oProjSize = acc.get_argument(OPROJSIZE); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = acc.get_argument(QUERY_PARALLEL_TENSOR_SHAPE); ParallelTensorShape key_parallel_tensor_shape = @@ -117,23 +137,27 @@ static DeviceSpecificDeviceStates positive_int num_samples = get_num_samples(parsed); positive_int num_heads = attrs.num_heads; - MHAPerDeviceState per_device_state = - init_kernel(handle, - allocator, - num_samples.int_from_positive_int(), - num_heads.int_from_positive_int(), - qSize.int_from_positive_int(), - kSize.int_from_positive_int(), - vSize.int_from_positive_int(), - qProjSize.int_from_positive_int(), - kProjSize.int_from_positive_int(), - vProjSize.int_from_positive_int(), - oProjSize.int_from_positive_int(), - qoSeqLength.int_from_positive_int(), - kvSeqLength.int_from_positive_int(), - attrs.add_bias_kv); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*per_device_ff_handle=*/handle, + /*allocator=*/allocator, + /*num_samples=*/num_samples.int_from_positive_int(), + /*num_heads=*/num_heads.int_from_positive_int(), + /*qSize=*/qSize.int_from_positive_int(), + /*kSize=*/kSize.int_from_positive_int(), + /*vSize=*/vSize.int_from_positive_int(), + /*qProjSize=*/qProjSize.int_from_positive_int(), + /*kProjSize=*/kProjSize.int_from_positive_int(), + /*vProjSize=*/vProjSize.int_from_positive_int(), + /*oProjSize=*/oProjSize.int_from_positive_int(), + /*qoSeqLength=*/qoSeqLength.int_from_positive_int(), + /*kvSeqLength=*/kvSeqLength.int_from_positive_int(), + /*add_bias_kv=*/attrs.add_bias_kv); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -144,11 +168,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); ProfilingSettings profiling = acc.get_argument(PROFILING); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + std::optional 
per_device_state = + acc.get_argument>(PER_DEVICE_STATE); return profile(forward_kernel, profiling, + kernel_device_type, "[MultiHeadAttention] forward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), @@ -171,9 +198,11 @@ static std::optional auto key_grad = acc.get_tensor_grad(KEY); auto value_grad = acc.get_tensor_grad(VALUE); - MHAPerDeviceState per_device_state = - acc.get_argument(PER_DEVICE_STATE); + std::optional per_device_state = + acc.get_argument>(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); float *key_grad_ptr = (key_grad == query_grad) ? nullptr : key_grad.get_float_ptr(); @@ -181,14 +210,15 @@ static std::optional ? nullptr : value_grad.get_float_ptr(); - assert(value_grad.shape == value.shape); - assert(key_grad.shape == key.shape); + ASSERT(value_grad.shape == value.shape); + ASSERT(key_grad.shape == key.shape); - assert(query_grad.shape == query.shape); - assert(weight_grad.shape.num_elements() == weight.shape.num_elements()); + ASSERT(query_grad.shape == query.shape); + ASSERT(weight_grad.shape == weight.shape); return profile(backward_kernel, profiling, + kernel_device_type, "[MultiHeadAttention] backward_time = {:.2lf}ms\n", per_device_state, query.get_float_ptr(), @@ -224,7 +254,7 @@ OpTaskSignature get_attention_init_signature() { init.add_arg_slot(ATTRS); init.add_unchecked_arg_slot(HANDLE); - init.add_return_value(); + init.add_return_value>(); return init; } @@ -239,7 +269,8 @@ OpTaskSignature get_attention_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot>( + PER_DEVICE_STATE); return fwd; } diff --git a/lib/task-spec/src/task-spec/ops/batch_matmul.cc b/lib/task-spec/src/task-spec/ops/batch_matmul.cc index 1ee9da82d3..f8d6955b41 100644 --- a/lib/task-spec/src/task-spec/ops/batch_matmul.cc +++ b/lib/task-spec/src/task-spec/ops/batch_matmul.cc @@ -17,6 +17,7 @@ #include "kernels/batch_matmul_kernels.h" #include "op-attrs/ops/batch_matmul.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "utils/containers/transform.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -31,28 +32,36 @@ enum Slots { OUTPUT, // tensor PROFILING, HANDLE, - ITERATION_CONFIG + ITERATION_CONFIG, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation forward(BatchMatmulAttrs const &attrs) { OpTaskBinding fwd; - fwd.bind(A_INPUT, input_tensor(0)); - fwd.bind(B_INPUT, input_tensor(1)); - fwd.bind(OUTPUT, output_tensor(0)); + fwd.bind(A_INPUT, input_tensor(0_n)); + fwd.bind(B_INPUT, input_tensor(1_n)); + fwd.bind(OUTPUT, output_tensor(0_n)); fwd.bind_arg(ATTRS, attrs); fwd.bind_arg(HANDLE, ff_handle()); fwd.bind_arg(PROFILING, profiling_settings()); fwd.bind_arg(ITERATION_CONFIG, iteration_config()); + fwd.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::BATCHMATMUL_FWD_TASK_ID, fwd}; + return OpTaskInvocation{ + task_id_t::BATCHMATMUL_FWD_TASK_ID, + fwd, + }; } OpTaskInvocation backward(BatchMatmulAttrs const &attrs) { OpTaskBinding bwd = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::BATCHMATMUL_BWD_TASK_ID, bwd}; + return OpTaskInvocation{ + task_id_t::BATCHMATMUL_BWD_TASK_ID, + bwd, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -60,27 +69,32 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto b_input = acc.get_tensor(B_INPUT); auto 
output = acc.get_tensor<Permissions::WO>(OUTPUT);
   auto attrs = acc.get_argument<BatchMatmulAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
 
-  positive_int m = b_input.shape.at(legion_dim_t{0_n});
-  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
-  positive_int n = a_input.shape.at(legion_dim_t{1_n});
-  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
-  positive_int k = a_input.shape.at(legion_dim_t{0_n});
-  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
+  positive_int m = dim_at_idx(b_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(m == dim_at_idx(output.shape.dims, legion_dim_t{0_n}));
+  positive_int n = dim_at_idx(a_input.shape.dims, legion_dim_t{1_n});
+  ASSERT(n == dim_at_idx(output.shape.dims, legion_dim_t{1_n}));
+  positive_int k = dim_at_idx(a_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(k == dim_at_idx(b_input.shape.dims, legion_dim_t{1_n}));
 
-  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
-  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+  ASSERT(get_num_elements(a_input.shape.dims) ==
+         get_num_elements(b_input.shape.dims));
+  ASSERT(get_num_elements(a_input.shape.dims) ==
+         get_num_elements(output.shape.dims));
 
   positive_int batch = 1_p;
-  for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) {
-    positive_int dim_size = a_input.shape.at(legion_dim_t{i});
-    ASSERT(dim_size == b_input.shape.at(legion_dim_t{i}));
-    ASSERT(dim_size == output.shape.at(legion_dim_t{i}));
+  for (nonnegative_int i :
+       nonnegative_range(2_n, get_num_dims(a_input.shape.dims))) {
+    positive_int dim_size = dim_at_idx(a_input.shape.dims, legion_dim_t{i});
+    ASSERT(dim_size == dim_at_idx(b_input.shape.dims, legion_dim_t{i}));
+    ASSERT(dim_size == dim_at_idx(output.shape.dims, legion_dim_t{i}));
 
     batch *= dim_size;
   }
@@ -92,6 +106,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 
   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[BatchMatmul] forward_time = {:.2lf}ms\n",
                  handle,
                  output.get_float_ptr(),
@@ -112,7 +127,9 @@ static std::optional<float>
   FFIterationConfig iter_config =
       acc.get_argument<FFIterationConfig>(ITERATION_CONFIG);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
 
   auto output = acc.get_tensor<Permissions::RO>(OUTPUT);
   auto output_grad = acc.get_tensor_grad<Permissions::RO>(OUTPUT);
@@ -127,25 +144,29 @@ static std::optional<float>
   ASSERT(b_input.shape == b_input_grad.shape);
 
   // check dims
-  positive_int m = b_input.shape.at(legion_dim_t{0_n});
-  ASSERT(m == output.shape.at(legion_dim_t{0_n}));
-  positive_int n = a_input.shape.at(legion_dim_t{1_n});
-  ASSERT(n == output.shape.at(legion_dim_t{1_n}));
-  positive_int k = a_input.shape.at(legion_dim_t{0_n});
-  ASSERT(k == b_input.shape.at(legion_dim_t{1_n}));
-  ASSERT(a_input.shape.num_elements() == b_input.shape.num_elements());
-  ASSERT(a_input.shape.num_elements() == output.shape.num_elements());
+  positive_int m = dim_at_idx(b_input.shape.dims, legion_dim_t{0_n});
+  ASSERT(m == dim_at_idx(output.shape.dims, legion_dim_t{0_n}));
+  positive_int n = dim_at_idx(a_input.shape.dims, legion_dim_t{1_n});
+  ASSERT(n == dim_at_idx(output.shape.dims, legion_dim_t{1_n}));
+
positive_int k = dim_at_idx(a_input.shape.dims, legion_dim_t{0_n}); + ASSERT(k == dim_at_idx(b_input.shape.dims, legion_dim_t{1_n})); + ASSERT(get_num_elements(a_input.shape.dims) == + get_num_elements(b_input.shape.dims)); + ASSERT(get_num_elements(a_input.shape.dims) == + get_num_elements(output.shape.dims)); positive_int batch = 1_p; - for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.num_dims())) { - positive_int dim_size = a_input.shape.at(legion_dim_t{i}); - ASSERT(dim_size == b_input.shape.at(legion_dim_t{i})); - ASSERT(dim_size == output.shape.at(legion_dim_t{i})); + for (nonnegative_int i : + nonnegative_range(2_n, get_num_dims(a_input.shape.dims))) { + positive_int dim_size = dim_at_idx(a_input.shape.dims, legion_dim_t{i}); + ASSERT(dim_size == dim_at_idx(b_input.shape.dims, legion_dim_t{i})); + ASSERT(dim_size == dim_at_idx(output.shape.dims, legion_dim_t{i})); batch *= dim_size; } return profile(backward_kernel, profiling, + kernel_device_type, "[BatchMatmul] backward_time = {:.2lf}ms\n", handle, output.get_float_ptr(), @@ -175,7 +196,8 @@ OpTaskSignature get_batch_matmul_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(HANDLE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + fwd.add_unchecked_arg_slot(HANDLE); return fwd; } diff --git a/lib/task-spec/src/task-spec/ops/batch_norm.cc b/lib/task-spec/src/task-spec/ops/batch_norm.cc index 67c5a7d8a2..0599eec3f5 100644 --- a/lib/task-spec/src/task-spec/ops/batch_norm.cc +++ b/lib/task-spec/src/task-spec/ops/batch_norm.cc @@ -15,91 +15,112 @@ #include "task-spec/ops/batch_norm.h" #include "kernels/batch_norm_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { using namespace FlexFlow::Kernels::BatchNorm; enum Slots { - INPUT, // tensor - SCALE, // tensor - BIAS, // tensor - OUTPUT, // tensor + INPUT, + SCALE, + BIAS, + OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, RELU, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(BatchNormAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(BIAS, input_tensor(2)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::BATCHNORM_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(BatchNormAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + binding.bind_arg( + PER_DEVICE_STATE, + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(SCALE, input_tensor(1)); - binding.bind(BIAS, input_tensor(2)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(SCALE, weight_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); - return {task_id_t::BATCHNORM_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(BatchNormAttrs const &attrs) { OpTaskBinding binding = 
infer_bwd_binding(forward(attrs).binding); - return {task_id_t::BATCHNORM_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::BATCHNORM_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - positive_int output_w = output.shape.at(legion_dim_t{0_n}); - positive_int output_h = output.shape.at(legion_dim_t{1_n}); - positive_int output_c = output.shape.at(legion_dim_t{2_n}); - positive_int output_n = output.shape.at(legion_dim_t{3_n}); + positive_int output_w = dim_at_idx(output.shape.dims, legion_dim_t{0_n}); + positive_int output_h = dim_at_idx(output.shape.dims, legion_dim_t{1_n}); + positive_int output_c = dim_at_idx(output.shape.dims, legion_dim_t{2_n}); + positive_int output_n = dim_at_idx(output.shape.dims, legion_dim_t{3_n}); float *runningMean; - BatchNormPerDeviceState per_device_state = - init_kernel(handle, - allocator, - runningMean, - output_n.int_from_positive_int(), - output_c.int_from_positive_int(), - output_h.int_from_positive_int(), - output_w.int_from_positive_int(), - attrs.relu); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*handle=*/handle, + /*allocator=*/allocator, + /*runningMean=*/runningMean, + /*output_n=*/output_n.int_from_positive_int(), + /*output_c=*/output_c.int_from_positive_int(), + /*output_h=*/output_h.int_from_positive_int(), + /*output_w=*/output_w.int_from_positive_int(), + /*relu=*/attrs.relu); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -108,6 +129,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[BatchNorm] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -121,6 +143,8 @@ static std::optional auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto input_grad = acc.get_tensor_grad(INPUT); @@ -132,6 +156,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[BatchNorm] backward_time = {:.2lf}ms\n", per_device_state, output.get_float_ptr(), @@ -141,7 +166,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.num_elements().int_from_positive_int()); + get_num_elements(output.shape.dims).int_from_positive_int()); } TaskImplFunction get_batch_norm_init_task_impl() { @@ -162,7 +187,7 @@ OpTaskSignature get_batch_norm_init_signature() { init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); init.add_arg_slot(PROFILING); - init.add_unchecked_arg_slot(HANDLE); + 
init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
 
   return init;
 }
 
@@ -175,7 +200,9 @@ OpTaskSignature get_batch_norm_fwd_signature() {
   fwd.add_input_slot(BIAS);
   fwd.add_output_slot(OUTPUT);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot<BatchNormPerDeviceState>(PER_DEVICE_STATE);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  fwd.add_unchecked_arg_slot<std::optional<BatchNormPerDeviceState>>(
+      PER_DEVICE_STATE);
 
   return fwd;
 }
diff --git a/lib/task-spec/src/task-spec/ops/cast.cc b/lib/task-spec/src/task-spec/ops/cast.cc
index 7cf26be95b..0c00f1be58 100644
--- a/lib/task-spec/src/task-spec/ops/cast.cc
+++ b/lib/task-spec/src/task-spec/ops/cast.cc
@@ -15,36 +15,45 @@
 #include "task-spec/ops/cast.h"
 #include "kernels/cast_kernels.h"
-
 #include "task-spec/op_task_signature.h"
+#include "task-spec/profiling.h"
 #include "utils/hash-utils.h"
 
 using namespace FlexFlow::Kernels::Cast;
 
 namespace FlexFlow {
 
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };
 
 OpTaskInvocation forward(CastAttrs const &attrs) {
   OpTaskBinding binding;
 
   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
 
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
 
-  return {task_id_t::CAST_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::CAST_FWD_TASK_ID,
+      binding,
+  };
 }
 
 OpTaskInvocation backward(CastAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
 
-  return {task_id_t::CAST_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::CAST_BWD_TASK_ID,
+      binding,
+  };
 }
 
 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto const &attrs = acc.get_argument<CastAttrs>(ATTRS);
 
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
@@ -52,6 +61,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 
   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Cast] forward_time = {:.2lf}ms\n",
                  input,
                  output);
@@ -60,6 +70,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto const &attrs = acc.get_argument<CastAttrs>(ATTRS);
 
   auto input = acc.get_tensor<Permissions::RO>(INPUT);
@@ -69,6 +81,7 @@ static std::optional<float>
 
   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Cast] backward_time = {:.2lf}ms\n",
                  input_grad,
                  output_grad);
@@ -86,6 +99,7 @@ OpTaskSignature get_cast_fwd_signature() {
 
   fwd.add_arg_slot<CastAttrs>(ATTRS);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
 
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
diff --git a/lib/task-spec/src/task-spec/ops/combine.cc b/lib/task-spec/src/task-spec/ops/combine.cc
deleted file mode 100644
index 41c276facb..0000000000
--- a/lib/task-spec/src/task-spec/ops/combine.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "task-spec/ops/combine.h" -#include "kernels/combine_kernels.h" -#include "task-spec/op_task_invocation.h" -#include "utils/hash-utils.h" - -namespace FlexFlow { - -using namespace FlexFlow::Kernels::Combine; - -enum Slots { INPUT, OUTPUT, PROFILING }; - -OpTaskInvocation forward(CombineAttrs const &attrs) { - OpTaskBinding binding; - - binding.bind_arg(PROFILING, profiling_settings()); - - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - - return {task_id_t::COMBINE_FWD_TASK_ID, binding}; -} - -OpTaskInvocation backward(CombineAttrs const &attrs) { - OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - - return {task_id_t::COMBINE_BWD_TASK_ID, b}; -} - -static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input = acc.get_tensor(INPUT); - auto output = acc.get_tensor(OUTPUT); - - return profile(forward_kernel, - profiling, - "[Combine] forward_time = {:.2lf}ms\n", - input, - output); -} - -static std::optional - backward_task_impl(TaskArgumentAccessor const &acc) { - ProfilingSettings profiling = acc.get_argument(PROFILING); - - auto input_grad = acc.get_tensor_grad(INPUT); - auto output_grad = acc.get_tensor_grad(OUTPUT); - - return profile(backward_kernel, - profiling, - "[Combine] backward_time = {:.2lf}ms\n", - input_grad, - output_grad); -} - -OpTaskSignature get_combine_fwd_signature() { - OpTaskSignature fwd(OpTaskType::FWD); - - fwd.add_arg_slot(PROFILING); - fwd.add_input_slot(INPUT); - fwd.add_output_slot(OUTPUT); - - return fwd; -} - -OpTaskSignature get_combine_bwd_signature() { - OpTaskSignature bwd = infer_bwd_signature(get_combine_fwd_signature()); - - return bwd; -} - -TaskImplFunction get_combine_fwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; -} -TaskImplFunction get_combine_bwd_task_impl() { - return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; -} - -}; // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/concat.cc b/lib/task-spec/src/task-spec/ops/concat.cc index 2cb082d1eb..26aa64f6ec 100644 --- a/lib/task-spec/src/task-spec/ops/concat.cc +++ b/lib/task-spec/src/task-spec/ops/concat.cc @@ -16,6 +16,7 @@ #include "task-spec/ops/concat.h" #include "kernels/concat_kernels.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "task-spec/variadic_tensor_ref.h" #include "utils/hash-utils.h" @@ -23,26 +24,43 @@ namespace FlexFlow { using namespace FlexFlow::Kernels::Concat; -enum Slots { INPUTS, OUTPUT, ATTRS, PROFILING, HANDLE, NUM_INPUTS }; +enum Slots { + INPUTS, + OUTPUT, + ATTRS, + PROFILING, + HANDLE, + NUM_INPUTS, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation forward(ConcatAttrs const &attrs) { OpTaskBinding binding; binding.bind(INPUTS, get_input_tensors()); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); binding.bind_arg(ATTRS, attrs); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return 
{task_id_t::CONCAT_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONCAT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(ConcatAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::CONCAT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::CONCAT_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); auto output = acc.get_tensor(OUTPUT); @@ -52,6 +70,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Concat] forward_time = {:.2lf}ms\n", output, inputs, @@ -61,6 +80,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); auto input_grads = acc.get_variadic_tensor_grad(INPUTS); @@ -70,6 +91,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Concat] backward_time = {:.2lf}ms\n", output_grad, input_grads, @@ -88,6 +110,7 @@ OpTaskSignature get_concat_fwd_signature() { fwd.add_arg_slot(ATTRS); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUTS, SlotType::VARIADIC); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/conv_2d.cc b/lib/task-spec/src/task-spec/ops/conv_2d.cc index ea4f7f79df..d7110eabfa 100644 --- a/lib/task-spec/src/task-spec/ops/conv_2d.cc +++ b/lib/task-spec/src/task-spec/ops/conv_2d.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/conv_2d.h" #include "kernels/conv_2d_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { @@ -13,19 +14,24 @@ enum Slots { ATTRS, PROFILING, PER_DEVICE_STATE, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(Conv2DAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(FILTER, weight_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::CONV2D_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(Conv2DAttrs const &attrs) { @@ -33,53 +39,68 @@ OpTaskInvocation forward(Conv2DAttrs const &attrs) { binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(FILTER, weight_tensor(0)); - binding.bind(BIAS, weight_tensor(1)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(FILTER, weight_tensor(0_n)); + binding.bind(BIAS, weight_tensor(1_n)); - return {task_id_t::CONV2D_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_FWD_TASK_ID, + 
binding, + }; } OpTaskInvocation backward(Conv2DAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::CONV2D_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::CONV2D_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); auto filter = acc.get_tensor(FILTER); auto filter_grad = acc.get_tensor_grad(FILTER); - Conv2DPerDeviceState per_device_state = - init_kernel(/*handle=*/handle, - /*activation=*/attrs.activation, - /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), - /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), - /*groups=*/attrs.groups.int_from_positive_int(), - /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), - /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), - /*stride_h=*/attrs.stride_h.int_from_positive_int(), - /*stride_w=*/attrs.stride_w.int_from_positive_int(), - /*input=*/input, - /*output=*/output, - /*filter_ptr=*/filter.get_float_ptr(), - /*filter_grad_ptr=*/filter_grad.get_float_ptr()); + std::optional per_device_state = init_kernel( + /*device_type=*/kernel_device_type, + /*handle=*/handle, + /*activation=*/attrs.activation, + /*kernel_h=*/attrs.kernel_h.int_from_positive_int(), + /*kernel_w=*/attrs.kernel_w.int_from_positive_int(), + /*groups=*/attrs.groups.int_from_positive_int(), + /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), + /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.int_from_positive_int(), + /*stride_w=*/attrs.stride_w.int_from_positive_int(), + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter.get_float_ptr(), + /*filter_grad_ptr=*/filter_grad.get_float_ptr()); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); @@ -91,6 +112,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Conv2d] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -103,6 +125,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto attrs = acc.get_argument(ATTRS); @@ -118,6 +142,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Conv2d] backward_time = {:.2lf}ms\n", per_device_state, output.get_float_ptr(), @@ -147,7 +172,8 @@ OpTaskSignature get_conv_2d_init_signature() { init.add_output_slot(OUTPUT); init.add_weight_slot(FILTER); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); 
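// The std::optional-wrapped per-device state above is the pattern this patch
// applies to every operator: init_kernel now takes the kernel DeviceType
// first and returns an empty optional when there is no device-specific state
// to set up (e.g. on the CPU path). A minimal sketch of that contract, using
// the hypothetical names FooPerDeviceState / foo_init_kernel rather than any
// real FlexFlow API:

#include <optional>

enum class DeviceType { CPU, GPU };

struct FooPerDeviceState {
  void *workspace; // e.g. cuDNN descriptors, scratch buffers, ...
};

std::optional<FooPerDeviceState> foo_init_kernel(DeviceType device_type) {
  if (device_type == DeviceType::CPU) {
    return std::nullopt; // CPU kernels carry no per-device state
  }
  return FooPerDeviceState{/*workspace=*/nullptr};
}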
init.add_return_value(); @@ -159,6 +185,7 @@ OpTaskSignature get_conv_2d_fwd_signature() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_arg_slot(ATTRS); fwd.add_input_slot(INPUT); diff --git a/lib/task-spec/src/task-spec/ops/dropout.cc b/lib/task-spec/src/task-spec/ops/dropout.cc index d19ace886b..a36506984e 100644 --- a/lib/task-spec/src/task-spec/ops/dropout.cc +++ b/lib/task-spec/src/task-spec/ops/dropout.cc @@ -2,65 +2,99 @@ #include "kernels/dropout_kernels.h" #include "task-spec/op_task_invocation.h" #include "task-spec/op_task_signature.h" +#include "task-spec/profiling.h" #include "utils/hash-utils.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Dropout; -enum Slots { INPUT, OUTPUT, ATTRS, PER_DEVICE_STATE, FF_HANDLE, PROFILING }; +enum Slots { + INPUT, + OUTPUT, + ATTRS, + PER_DEVICE_STATE, + FF_HANDLE, + PROFILING, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation init(DropoutAttrs const &attrs) { OpTaskBinding binding; binding.bind_arg(ATTRS, attrs); binding.bind_arg(FF_HANDLE, ff_handle()); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::DROPOUT_INIT_TASK_ID, binding}; + binding.bind(OUTPUT, output_tensor(0_n)); + + return OpTaskInvocation{ + task_id_t::DROPOUT_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(DropoutAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - return {task_id_t::DROPOUT_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::DROPOUT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(DropoutAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::DROPOUT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::DROPOUT_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); Allocator allocator = acc.get_allocator(); - PerDeviceFFHandle handle = acc.get_argument(FF_HANDLE); + device_handle_t handle = acc.get_argument(FF_HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - DropoutPerDeviceState per_device_state = - init_kernel(handle, attrs.rate, attrs.seed, output.shape, allocator); + std::optional per_device_state = + init_kernel(kernel_device_type, + handle, + attrs.rate, + attrs.seed, + output.shape, + allocator); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + acc.get_argument>(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, + kernel_device_type, "[Dropout] forward_time = {:.2lf}ms\n", per_device_state, input.get_float_ptr(), @@ -73,12 +107,15 @@ static std::optional auto 
per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); return profile(backward_kernel, profiling, + kernel_device_type, "[Dropout] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), @@ -99,10 +136,11 @@ OpTaskSignature get_dropout_init_signature() { OpTaskSignature init(OpTaskType::INIT); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(FF_HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(FF_HANDLE); init.add_output_slot(OUTPUT); - init.add_return_value(); + init.add_return_value>(); return init; } @@ -110,8 +148,10 @@ OpTaskSignature get_dropout_init_signature() { OpTaskSignature get_dropout_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); + fwd.add_unchecked_arg_slot>( + PER_DEVICE_STATE); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/element_binary.cc b/lib/task-spec/src/task-spec/ops/element_binary.cc index 5356901423..a5f9f012fe 100644 --- a/lib/task-spec/src/task-spec/ops/element_binary.cc +++ b/lib/task-spec/src/task-spec/ops/element_binary.cc @@ -1,5 +1,6 @@ #include "task-spec/ops/element_binary.h" #include "kernels/element_binary_kernels.h" +#include "task-spec/profiling.h" #include "task-spec/task_signature_impl.h" #include "utils/hash-utils.h" @@ -14,40 +15,55 @@ enum Slots { PROFILING, PER_DEVICE_STATE, HANDLE, - ATTRS + ATTRS, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(ElementBinaryAttrs const &attrs) { OpTaskBinding binding; - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(LHS_INPUT, input_tensor(0_n)); + binding.bind(RHS_INPUT, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTBINARY_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(ElementBinaryAttrs const &attrs) { OpTaskBinding binding; - binding.bind(LHS_INPUT, input_tensor(0)); - binding.bind(RHS_INPUT, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(LHS_INPUT, input_tensor(0_n)); + binding.bind(RHS_INPUT, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + binding.bind_arg( + PER_DEVICE_STATE, + per_device_op_state>()); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTBINARY_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(ElementBinaryAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ELEMENTBINARY_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTBINARY_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates @@ -56,23 +72,31 @@ static DeviceSpecificDeviceStates auto input_rhs = 
acc.get_tensor(RHS_INPUT); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - ElementBinaryPerDeviceState per_device_state = - init_kernel(handle, + std::optional per_device_state = + init_kernel(kernel_device_type, + handle, attrs.type, attrs.should_broadcast_lhs, attrs.should_broadcast_rhs, input_lhs.shape, input_rhs.shape, output.shape); + return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); auto const &attrs = acc.get_argument(ATTRS); @@ -80,10 +104,11 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input_lhs = acc.get_tensor(LHS_INPUT); auto input_rhs = acc.get_tensor(RHS_INPUT); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); return profile(forward_kernel, profiling, + kernel_device_type, "[ElementBinary] forward_time = {:.2lf}ms\n", per_device_state, input_lhs.get_float_ptr(), @@ -99,8 +124,10 @@ static std::optional auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); auto input_lhs = acc.get_tensor(LHS_INPUT); auto input_rhs = acc.get_tensor(RHS_INPUT); @@ -111,6 +138,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[ElementBinary] backward_time = {:.2lf}ms\n", per_device_state, output_grad.get_float_ptr(), @@ -142,8 +170,10 @@ OpTaskSignature get_element_binary_init_signature() { init.add_input_slot(LHS_INPUT); init.add_input_slot(RHS_INPUT); init.add_output_slot(OUTPUT); + init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -156,7 +186,8 @@ OpTaskSignature get_element_binary_fwd_signature() { fwd.add_arg_slot(PROFILING); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_arg_slot(ATTRS); - fwd.add_unchecked_arg_slot(HANDLE); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + fwd.add_unchecked_arg_slot(HANDLE); fwd.add_input_slot(LHS_INPUT); fwd.add_input_slot(RHS_INPUT); diff --git a/lib/task-spec/src/task-spec/ops/element_unary.cc b/lib/task-spec/src/task-spec/ops/element_unary.cc index 1f4e651251..f8df53b578 100644 --- a/lib/task-spec/src/task-spec/ops/element_unary.cc +++ b/lib/task-spec/src/task-spec/ops/element_unary.cc @@ -1,6 +1,7 @@ #include "task-spec/ops/element_unary.h" #include "kernels/element_unary_kernels.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/profiling.h" #include "utils/hash-utils.h" namespace FlexFlow { @@ -13,10 +14,12 @@ enum Slots { INPUT, INPUT_SHAPE, OUTPUT, + OUTPUT_SHAPE, ATTRS, HANDLE, PROFILING, - PER_DEVICE_STATE + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE, }; /* ElementUnary */ @@ -24,49 +27,67 @@ 
OpTaskInvocation init(ElementUnaryAttrs const &attrs) { OpTaskBinding b; b.bind_arg(ATTRS, attrs); - b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0)); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::ELEMENTUNARY_INIT_TASK_ID, b}; + b.bind_arg(INPUT_SHAPE, input_parallel_tensor_shape(0_n)); + b.bind_arg(OUTPUT_SHAPE, output_parallel_tensor_shape(0_n)); + + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(ElementUnaryAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); + b.bind(INPUT, input_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); b.bind_arg(ATTRS, attrs); b.bind_arg(HANDLE, ff_handle()); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - return {task_id_t::ELEMENTUNARY_FWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(ElementUnaryAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::ELEMENTUNARY_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::ELEMENTUNARY_BWD_TASK_ID, + b, + }; } static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); ParallelTensorShape input_shape = acc.get_argument(INPUT_SHAPE); - ParallelTensorShape output_shape = - throw_if_unexpected(get_output_shape(attrs, input_shape)); - ElementUnaryPerDeviceState per_device_state = - init_kernel(array_shape_from_tensor_shape(get_piece_shape(input_shape)), - array_shape_from_tensor_shape(get_piece_shape(output_shape)), + acc.get_argument(OUTPUT_SHAPE); + + std::optional per_device_state = + init_kernel(kernel_device_type, + get_piece_shape(input_shape), + get_piece_shape(output_shape), attrs); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -74,14 +95,17 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); return profile(forward_kernel, profiling, + kernel_device_type, "[ElementUnary] forward_time = {:.2lf}ms\n", per_device_state, attrs, @@ -98,14 +122,17 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - auto handle = acc.get_argument(HANDLE); + auto handle = acc.get_argument(HANDLE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); return profile(backward_kernel, profiling, + kernel_device_type, "[ElementUnary] backward_time = {:.2lf}ms\n", per_device_state, attrs, @@ -131,7 +158,8 @@ OpTaskSignature get_element_unary_init_signature() { init.add_arg_slot(INPUT_SHAPE); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + 
init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -145,6 +173,7 @@ OpTaskSignature get_element_unary_fwd_signature() { fwd.add_output_slot(OUTPUT); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); return fwd; diff --git a/lib/task-spec/src/task-spec/ops/embedding.cc b/lib/task-spec/src/task-spec/ops/embedding.cc new file mode 100644 index 0000000000..4ba32c8483 --- /dev/null +++ b/lib/task-spec/src/task-spec/ops/embedding.cc @@ -0,0 +1,120 @@ +#include "task-spec/ops/embedding.h" +#include "kernels/embedding_kernels.h" +#include "task-spec/profiling.h" + +namespace FlexFlow { + +using namespace FlexFlow::Kernels::Embedding; + +enum Slots { INPUT, WEIGHT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE }; + +OpTaskInvocation forward(EmbeddingAttrs const &attrs) { + OpTaskBinding b; + + b.bind(INPUT, input_tensor(0_n)); + b.bind(WEIGHT, weight_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + + b.bind_arg(ATTRS, attrs); + b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + return OpTaskInvocation{ + task_id_t::EMBED_FWD_TASK_ID, + b, + }; +} + +OpTaskInvocation backward(EmbeddingAttrs const &attrs) { + OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); + + return OpTaskInvocation{ + task_id_t::EMBED_BWD_TASK_ID, + b, + }; +} + +static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto weight = acc.get_tensor(WEIGHT); + auto output = acc.get_tensor(OUTPUT); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + EmbeddingAttrs attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + + return profile( + forward_kernel, + profiling, + kernel_device_type, + "[Embedding] forward_time = {:.2lf}ms\n", + input, + output, + weight, + input.shape.data_type, + output.shape.data_type, + attrs.aggr, + get_num_dims(input.shape.dims).unwrap_nonnegative(), + get_num_dims(output.shape.dims).unwrap_nonnegative(), + dim_at_idx(input.shape.dims, legion_dim_t{1_n}).int_from_positive_int()); +} + +static std::optional + backward_task_impl(TaskArgumentAccessor const &acc) { + auto input = acc.get_tensor(INPUT); + auto output = acc.get_tensor(OUTPUT); + auto weight_grad = acc.get_tensor_grad(WEIGHT); + + ProfilingSettings profiling = acc.get_argument(PROFILING); + EmbeddingAttrs attrs = acc.get_argument(ATTRS); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); + + return profile( + backward_kernel, + profiling, + kernel_device_type, + "[Embedding] backward_time = {:.2lf}ms\n", + output, + input, + weight_grad, + output.shape.data_type, + input.shape.data_type, + attrs.aggr, + get_num_dims(input.shape.dims).unwrap_nonnegative(), + get_num_dims(output.shape.dims).unwrap_nonnegative(), + dim_at_idx(input.shape.dims, ff_dim_t{0_n}).int_from_positive_int()); +} + +TaskImplFunction get_embedding_fwd_task_impl() { + return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}}; +} +TaskImplFunction get_embedding_bwd_task_impl() { + return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}}; +} + +OpTaskSignature get_embedding_fwd_signature() { + OpTaskSignature fwd(OpTaskType::FWD); + + fwd.add_input_slot(INPUT); + fwd.add_input_slot(OUTPUT); + fwd.add_input_slot(WEIGHT); + + fwd.add_arg_slot(ATTRS); + fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); + + 
return fwd; +} + +OpTaskSignature get_embedding_bwd_signature() { + OpTaskSignature bwd = infer_bwd_signature(get_embedding_fwd_signature()); + return bwd; +} + +std::vector get_task_ids(EmbeddingAttrs const &) { + return {task_id_t::EMBED_FWD_TASK_ID, task_id_t::EMBED_BWD_TASK_ID}; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/ops/flat.cc b/lib/task-spec/src/task-spec/ops/flat.cc index 1bc0999e1a..6cec1b383f 100644 --- a/lib/task-spec/src/task-spec/ops/flat.cc +++ b/lib/task-spec/src/task-spec/ops/flat.cc @@ -1,35 +1,47 @@ #include "task-spec/ops/flat.h" #include "kernels/flat_kernels.h" +#include "task-spec/profiling.h" namespace FlexFlow { using namespace FlexFlow::Kernels::Flat; -enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING }; +enum SLOTS { INPUT, OUTPUT, HANDLE, PROFILING, KERNEL_DEVICE_TYPE }; OpTaskInvocation forward(FlatAttrs const &attrs) { OpTaskBinding binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(PROFILING, profiling_settings()); - return {task_id_t::FLAT_FWD_TASK_ID, binding}; + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + + return OpTaskInvocation{ + task_id_t::FLAT_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(FlatAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::FLAT_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::FLAT_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); return profile(forward_kernel, profiling, + kernel_device_type, "[Flat] forward_time = {:.2lf}ms\n", input, output.get_float_ptr()); @@ -38,6 +50,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto input = acc.get_tensor(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -45,6 +59,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Flat] backward_time = {:.2lf}ms\n", input, output_grad.get_float_ptr(), @@ -62,6 +77,7 @@ OpTaskSignature get_flat_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); diff --git a/lib/task-spec/src/task-spec/ops/gather.cc b/lib/task-spec/src/task-spec/ops/gather.cc index 5f7173a991..7f8aacf9d6 100644 --- a/lib/task-spec/src/task-spec/ops/gather.cc +++ b/lib/task-spec/src/task-spec/ops/gather.cc @@ -15,6 +15,8 @@ #include "task-spec/ops/gather.h" #include "kernels/gather_kernels.h" +#include "op-attrs/ff_ordered/get_idxs.h" +#include "task-spec/profiling.h" #include "utils/nonnegative_int/nonnegative_range.h" #include @@ -22,18 +24,31 @@ namespace FlexFlow { using namespace FlexFlow::Kernels::Gather; -enum Slots { INPUT, OUTPUT, INDEX, ATTRS, HANDLE, PROFILING, PER_DEVICE_STATE }; +enum Slots { + INPUT, + OUTPUT, + INDEX, + ATTRS, + HANDLE, + PROFILING, + PER_DEVICE_STATE, + KERNEL_DEVICE_TYPE +}; OpTaskInvocation init(GatherAttrs const &attrs) { OpTaskBinding 
binding; - binding.bind(INPUT, input_tensor(0)); - binding.bind(INDEX, input_tensor(1)); - binding.bind(OUTPUT, output_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(INDEX, input_tensor(1_n)); + binding.bind(OUTPUT, output_tensor(0_n)); binding.bind_arg(ATTRS, attrs); binding.bind_arg(HANDLE, ff_handle()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); - return {task_id_t::GATHER_INIT_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_INIT_TASK_ID, + binding, + }; } OpTaskInvocation forward(GatherAttrs const &attrs) { @@ -41,20 +56,27 @@ OpTaskInvocation forward(GatherAttrs const &attrs) { binding.bind_arg(ATTRS, attrs); binding.bind_arg(PROFILING, profiling_settings()); + binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); + per_device_op_state>()); - binding.bind(INPUT, input_tensor(0)); - binding.bind(OUTPUT, output_tensor(0)); - binding.bind(INDEX, weight_tensor(0)); + binding.bind(INPUT, input_tensor(0_n)); + binding.bind(OUTPUT, output_tensor(0_n)); + binding.bind(INDEX, weight_tensor(0_n)); - return {task_id_t::GATHER_FWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_FWD_TASK_ID, + binding, + }; } OpTaskInvocation backward(GatherAttrs const &attrs) { OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::GATHER_BWD_TASK_ID, binding}; + return OpTaskInvocation{ + task_id_t::GATHER_BWD_TASK_ID, + binding, + }; } static DeviceSpecificDeviceStates @@ -63,29 +85,34 @@ static DeviceSpecificDeviceStates auto index = acc.get_tensor(INDEX); auto output = acc.get_tensor(OUTPUT); - PerDeviceFFHandle handle = acc.get_argument(HANDLE); + device_handle_t handle = acc.get_argument(HANDLE); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto const &attrs = acc.get_argument(ATTRS); - legion_dim_t legion_dim = - legion_dim_from_ff_dim(attrs.dim, input.shape.num_dims()); - assert(input.shape.num_dims() == index.shape.num_dims()); - assert(output.shape.num_dims() == index.shape.num_dims()); + ASSERT(get_num_dims(input.shape.dims) == get_num_dims(index.shape.dims)); + ASSERT(get_num_dims(output.shape.dims) == get_num_dims(index.shape.dims)); - for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { - assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); - if (i != legion_dim.value) { - assert(input.shape.at(legion_dim_t{i}) == - index.shape.at(legion_dim_t{i})); + for (ff_dim_t i : get_idxs(input.shape.dims.ff_ordered)) { + ASSERT(dim_at_idx(index.shape.dims, i) == dim_at_idx(output.shape.dims, i)); + if (i != attrs.dim) { + ASSERT(dim_at_idx(input.shape.dims, i) == + dim_at_idx(index.shape.dims, i)); } } - GatherPerDeviceState per_device_state = {handle, legion_dim}; + std::optional per_device_state = + init_kernel(kernel_device_type, handle, attrs.dim); return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; + DeviceSpecific>::create( + per_device_state), + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -95,6 +122,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, + kernel_device_type, "[Gather] forward_time = {:.2lf}ms\n", per_device_state, input, @@ 
-105,6 +133,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto per_device_state = acc.get_argument(PER_DEVICE_STATE); @@ -114,6 +144,7 @@ static std::optional return profile(backward_kernel, profiling, + kernel_device_type, "[Gather] backward_time = {:.2lf}ms\n", per_device_state, output_grad, @@ -139,7 +170,8 @@ OpTaskSignature get_gather_init_signature() { init.add_output_slot(OUTPUT); init.add_arg_slot(ATTRS); - init.add_unchecked_arg_slot(HANDLE); + init.add_arg_slot(KERNEL_DEVICE_TYPE); + init.add_unchecked_arg_slot(HANDLE); init.add_return_value(); @@ -150,6 +182,7 @@ OpTaskSignature get_gather_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); + fwd.add_arg_slot(KERNEL_DEVICE_TYPE); fwd.add_arg_slot(ATTRS); fwd.add_input_slot(INPUT); diff --git a/lib/task-spec/src/task-spec/ops/layer_norm.cc b/lib/task-spec/src/task-spec/ops/layer_norm.cc index 8db2281bcf..b37e63c2d1 100644 --- a/lib/task-spec/src/task-spec/ops/layer_norm.cc +++ b/lib/task-spec/src/task-spec/ops/layer_norm.cc @@ -15,8 +15,11 @@ #include "task-spec/ops/layer_norm.h" #include "kernels/layer_norm_kernels.h" +#include "op-attrs/ff_ordered/transform.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" +#include "task-spec/profiling.h" +#include "utils/containers/product.h" #include "utils/exception.h" #include "utils/hash-utils.h" #include "utils/nonnegative_int/nonnegative_range.h" @@ -34,37 +37,50 @@ enum Slots { BETA, PER_DEVICE_STATE, ATTRS, - HANDLE + HANDLE, + KERNEL_DEVICE_TYPE, }; OpTaskInvocation init(LayerNormAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); + b.bind(INPUT, input_tensor(0_n)); b.bind_arg(HANDLE, ff_handle()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(ATTRS, attrs); - return {task_id_t::LAYERNORM_INIT_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::LAYERNORM_INIT_TASK_ID, + b, + }; } OpTaskInvocation forward(LayerNormAttrs const &attrs) { OpTaskBinding b; - b.bind(INPUT, input_tensor(0)); - b.bind(OUTPUT, output_tensor(0)); - b.bind(GAMMA, weight_tensor(0)); // todo, this may have some problem - b.bind(BETA, weight_tensor(1)); // how to get gmmam and beta + b.bind(INPUT, input_tensor(0_n)); + b.bind(OUTPUT, output_tensor(0_n)); + b.bind(GAMMA, weight_tensor(0_n)); + b.bind(BETA, weight_tensor(1_n)); b.bind_arg(PROFILING, profiling_settings()); - b.bind_arg(PER_DEVICE_STATE, per_device_op_state()); - - return {task_id_t::LAYERNORM_FWD_TASK_ID, b}; + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); + b.bind_arg(PER_DEVICE_STATE, + per_device_op_state>()); + + return OpTaskInvocation{ + task_id_t::LAYERNORM_FWD_TASK_ID, + b, + }; } OpTaskInvocation backward(LayerNormAttrs const &attrs) { OpTaskBinding b = infer_bwd_binding(forward(attrs).binding); - return {task_id_t::LAYERNORM_BWD_TASK_ID, b}; + return OpTaskInvocation{ + task_id_t::LAYERNORM_BWD_TASK_ID, + b, + }; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -74,10 +90,13 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto beta = acc.get_tensor(BETA); ProfilingSettings profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); auto &state = 
       acc.get_argument(PER_DEVICE_STATE);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[LayerNorm] forward time = {:.2lf}ms\n",
                  state,
                  input,
@@ -97,10 +116,13 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto &state =
       acc.get_argument(PER_DEVICE_STATE);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[LayerNorm] backward time = {:.2lf}ms\n",
                  state,
                  output_grad,
@@ -114,33 +136,35 @@ static std::optional<float>
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<LayerNormAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Allocator allocator = acc.get_allocator();
   auto input = acc.get_tensor(INPUT);
-  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-
-  positive_int M = 1_p;
-  for (int i = 0; i < attrs.axes.size(); i++) {
-    legion_dim_t legion_dim =
-        legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims());
-    M *= input.shape.at(legion_dim);
-  }
-  positive_int num_replicas = 1_p;
-  for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) {
-    num_replicas *= input.shape.at(legion_dim_t{i});
-  }
+  auto handle = acc.get_argument<device_handle_t>(HANDLE);
+
+  positive_int M = product(transform(attrs.axes, [&](ff_dim_t dim) {
+    return dim_at_idx(input.shape.dims, dim);
+  }));
+
+  positive_int num_replicas = get_num_elements(input.shape.dims);
+
   positive_int effective_num_elements = M;
   positive_int effective_batch_size =
-      positive_int{input.shape.num_elements() / M};
+      positive_int{get_num_elements(input.shape.dims) / M};

-  LayerNormPerDeviceState per_device_state =
-      init_kernel(handle,
+  std::optional<LayerNormPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   allocator,
                   attrs.elementwise_affine,
                   effective_batch_size.int_from_positive_int(),
                   effective_num_elements.int_from_positive_int(),
                   attrs.eps);
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<LayerNormPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<LayerNormPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 TaskImplFunction get_layer_norm_init_task_impl() {
@@ -162,6 +186,7 @@ OpTaskSignature get_layer_norm_fwd_signature() {
   fwd.add_weight_slot(BETA);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
 }
@@ -176,7 +201,8 @@ OpTaskSignature get_layer_norm_init_signature() {
   init.add_input_slot(INPUT);

   init.add_arg_slot<LayerNormAttrs>(ATTRS);
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);

   init.add_return_value();
   return init;
diff --git a/lib/task-spec/src/task-spec/ops/linear.cc b/lib/task-spec/src/task-spec/ops/linear.cc
index e8be7781f5..9ce02bc7fd 100644
--- a/lib/task-spec/src/task-spec/ops/linear.cc
+++ b/lib/task-spec/src/task-spec/ops/linear.cc
@@ -1,15 +1,14 @@
 #include "task-spec/ops/linear.h"
-#include "kernels/linear_kernels.h"
 #include "kernels/format_accessor_contents.h"
+#include "kernels/linear_kernels.h"
 #include "op-attrs/ff_dim_t.h"
+#include "task-spec/profiling.h"
 #include "task-spec/task_argument_accessor.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

 namespace FlexFlow {

-using namespace FlexFlow::Kernels::Linear;
-
 enum slots {
   INPUT,
   OUTPUT,
@@ -18,72 +17,87 @@ enum slots {
   ATTRS,
   PROFILING,
   HANDLE,
-  PER_DEVICE_STATE
+  PER_DEVICE_STATE,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation init(LinearAttrs const &attrs) {
   OpTaskBinding binding;
   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(WEIGHT, weight_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(WEIGHT, weight_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::LINEAR_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_INIT_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation forward(LinearAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(WEIGHT, weight_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(WEIGHT, weight_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   if (attrs.use_bias) {
-    binding.bind(BIAS, weight_tensor(1));
+    binding.bind(BIAS, weight_tensor(1_n));
   }

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<LinearPerDeviceState>());
+                   per_device_op_state<std::optional<LinearPerDeviceState>>());
   binding.bind_arg(ATTRS, attrs);

-  return {task_id_t::LINEAR_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(LinearAttrs const &attrs) {
   OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::LINEAR_BWD_TASK_ID, b};
+  return OpTaskInvocation{
+      task_id_t::LINEAR_BWD_TASK_ID,
+      b,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<LinearAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto weight = acc.get_tensor(WEIGHT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = output.shape.at(ff_dim_t{1_n});
-
-  float *one_ptr;
-
-  LinearPerDeviceState per_device_state =
-      init_kernel(handle,
-                  one_ptr,
-                  attrs.activation,
-                  attrs.regularizer,
-                  attrs.use_bias,
-                  input.data_type,
-                  weight.data_type,
-                  output.data_type,
-                  batch_size.int_from_positive_int(),
-                  attrs.out_channels.int_from_positive_int());
+  positive_int out_dim = dim_at_idx(output.shape.dims, ff_dim_t{0_n});
+  positive_int batch_size = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+
+  std::optional<LinearPerDeviceState> per_device_state =
+      linear_init_kernel(kernel_device_type,
+                         handle,
+                         attrs.activation,
+                         attrs.regularizer,
+                         attrs.use_bias,
+                         input.shape.data_type,
+                         weight.shape.data_type,
+                         output.shape.data_type,
+                         batch_size.int_from_positive_int(),
+                         attrs.out_channels.int_from_positive_int());
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<LinearPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<LinearPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
@@ -92,31 +106,27 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto output = acc.get_tensor(OUTPUT);

   auto per_device_state =
-      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+      acc.get_argument<std::optional<LinearPerDeviceState>>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<LinearAttrs>(ATTRS);

-  positive_int in_dim = input.shape.at(ff_dim_t{0_n});
-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = positive_int{output.shape.num_elements() / out_dim};
-
-  float const *bias_ptr = NULL;
+  std::optional<GenericTensorAccessorR> bias = std::nullopt;
   if (attrs.use_bias) {
-    auto bias = acc.get_tensor(BIAS);
-    bias_ptr = bias.get_float_ptr();
+    bias = acc.get_tensor(BIAS);
   }

-  auto result = profile(forward_kernel,
-                        profiling,
-                        "[Linear] forward_time = {:.2lf}ms\n",
-                        per_device_state,
-                        input.get_float_ptr(),
-                        output.get_float_ptr(),
-                        weight.get_float_ptr(),
-                        bias_ptr,
-                        in_dim.int_from_positive_int(),
-                        out_dim.int_from_positive_int(),
-                        batch_size.int_from_positive_int());
+  auto result = profile(linear_forward_kernel,
+                        profiling,
+                        kernel_device_type,
+                        "[Linear] forward_time = {:.2lf}ms\n",
+                        per_device_state,
+                        attrs,
+                        input,
+                        output,
+                        weight,
+                        bias);

   return result;
 }
@@ -132,34 +142,30 @@ static std::optional<float>
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   auto per_device_state =
-      acc.get_argument<LinearPerDeviceState>(PER_DEVICE_STATE);
+      acc.get_argument<std::optional<LinearPerDeviceState>>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<LinearAttrs>(ATTRS);

-  float *bias_grad_ptr = NULL;
+  std::optional<GenericTensorAccessorW> bias_grad = std::nullopt;
   if (attrs.use_bias) {
-    auto bias_grad = acc.get_tensor_grad(BIAS);
-    bias_grad_ptr = bias_grad.get_float_ptr();
+    bias_grad = acc.get_tensor_grad(BIAS);
   }

-  positive_int in_dim = input.shape.at(ff_dim_t{0_n});
-  positive_int out_dim = output.shape.at(ff_dim_t{0_n});
-  positive_int batch_size = positive_int{output.shape.num_elements() / out_dim};
-
-  auto result = profile(backward_kernel,
-                        profiling,
-                        "[Linear] backward_time = {:.2lf}ms\n",
-                        per_device_state,
-                        output.get_float_ptr(),
-                        output_grad.get_float_ptr(),
-                        input.get_float_ptr(),
-                        input_grad.get_float_ptr(),
-                        weight.get_float_ptr(),
-                        weight_grad.get_float_ptr(),
-                        bias_grad_ptr,
-                        in_dim.int_from_positive_int(),
-                        out_dim.int_from_positive_int(),
-                        batch_size.int_from_positive_int());
+  auto result = profile(linear_backward_kernel,
+                        profiling,
+                        kernel_device_type,
+                        "[Linear] backward_time = {:.2lf}ms\n",
+                        per_device_state,
+                        attrs,
+                        output,
+                        output_grad,
+                        input,
+                        input_grad,
+                        weight,
+                        weight_grad,
+                        bias_grad);

   return result;
 }
@@ -167,9 +173,11 @@ static std::optional<float>
 TaskImplFunction get_linear_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_linear_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_linear_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -182,6 +190,7 @@ OpTaskSignature get_linear_init_signature() {
   init.add_output_slot(OUTPUT);

   init.add_arg_slot<LinearAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   init.add_unchecked_arg_slot(HANDLE);

   init.add_return_value();
@@ -197,6 +206,7 @@ OpTaskSignature get_linear_fwd_signature() {
   fwd.add_output_slot(OUTPUT);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_arg_slot<LinearAttrs>(ATTRS);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
diff --git a/lib/task-spec/src/task-spec/ops/pool_2d.cc b/lib/task-spec/src/task-spec/ops/pool_2d.cc
index bceced61d3..20707acb2d 100644
--- a/lib/task-spec/src/task-spec/ops/pool_2d.cc
+++ b/lib/task-spec/src/task-spec/ops/pool_2d.cc
@@ -1,6 +1,7 @@
 #include "task-spec/ops/pool_2d.h"
 #include "kernels/pool_2d_kernels.h"
 #include "op-attrs/ops/pool_2d.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

@@ -8,16 +9,28 @@ using namespace FlexFlow::Kernels::Pool2D;

 namespace FlexFlow {

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  PER_DEVICE_STATE,
+  HANDLE,
+  KERNEL_DEVICE_TYPE
+};

 OpTaskInvocation init(Pool2DAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   binding.bind_arg(ATTRS, attrs);
   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  return {task_id_t::POOL2D_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_INIT_TASK_ID,
+      binding,
+  };
 }

 static nonnegative_int calculate_padding(nonnegative_int output_size,
@@ -37,22 +50,25 @@ static nonnegative_int calculate_padding(nonnegative_int output_size,
 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
   auto const &attrs = acc.get_argument<Pool2DAttrs>(ATTRS);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int input_w = input.shape.at(ff_dim_t{0_n});
-  positive_int input_h = input.shape.at(ff_dim_t{1_n});
-  positive_int input_c = input.shape.at(ff_dim_t{2_n});
-  positive_int input_n = input.shape.at(ff_dim_t{3_n});
-  positive_int output_w = output.shape.at(ff_dim_t{0_n});
-  positive_int output_h = output.shape.at(ff_dim_t{1_n});
-  positive_int output_c = output.shape.at(ff_dim_t{2_n});
-  positive_int output_n = output.shape.at(ff_dim_t{3_n});
-
-  Pool2DPerDeviceState per_device_state =
-      init_kernel(handle,
+  positive_int input_w = dim_at_idx(input.shape.dims, ff_dim_t{0_n});
+  positive_int input_h = dim_at_idx(input.shape.dims, ff_dim_t{1_n});
+  positive_int input_c = dim_at_idx(input.shape.dims, ff_dim_t{2_n});
+  positive_int input_n = dim_at_idx(input.shape.dims, ff_dim_t{3_n});
+  positive_int output_w = dim_at_idx(output.shape.dims, ff_dim_t{0_n});
+  positive_int output_h = dim_at_idx(output.shape.dims, ff_dim_t{1_n});
+  positive_int output_c = dim_at_idx(output.shape.dims, ff_dim_t{2_n});
+  positive_int output_n = dim_at_idx(output.shape.dims, ff_dim_t{3_n});
+
+  std::optional<Pool2DPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   attrs.activation,
                   input_w.int_from_positive_int(),
                   input_h.int_from_positive_int(),
@@ -71,29 +87,40 @@ static DeviceSpecificDeviceStates
                   attrs.pool_type);

   return DeviceSpecificDeviceStates{
-      DeviceSpecific<Pool2DPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<Pool2DPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 OpTaskInvocation forward(Pool2DAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<Pool2DPerDeviceState>());
+                   per_device_op_state<std::optional<Pool2DPerDeviceState>>());

-  return {task_id_t::POOL2D_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(Pool2DAttrs const &attrs) {
   OpTaskBinding b = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::POOL2D_BWD_TASK_ID, b};
+  return OpTaskInvocation{
+      task_id_t::POOL2D_BWD_TASK_ID,
+      b,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Pool2DPerDeviceState state =
       acc.get_argument<Pool2DPerDeviceState>(PER_DEVICE_STATE);
@@ -102,6 +129,7 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Pool2D] forward_time = {:.2lf}ms\n",
                  state,
                  input.get_float_ptr(),
@@ -111,6 +139,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   Pool2DPerDeviceState state =
       acc.get_argument<Pool2DPerDeviceState>(PER_DEVICE_STATE);
@@ -121,6 +151,7 @@ static std::optional<float>

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Pool2D] backward_time = {:.2lf}ms\n",
                  state,
                  output.get_float_ptr(),
@@ -132,9 +163,11 @@ static std::optional<float>
 TaskImplFunction get_pool_2d_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_pool_2d_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_pool_2d_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -146,21 +179,25 @@ OpTaskSignature get_pool_2d_init_signature() {
   init.add_output_slot(OUTPUT);

   init.add_arg_slot<Pool2DAttrs>(ATTRS);
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);

   init.add_return_value();
   return init;
 }
+
 OpTaskSignature get_pool_2d_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   return fwd;
 }
+
 OpTaskSignature get_pool_2d_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_pool_2d_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/reduce.cc b/lib/task-spec/src/task-spec/ops/reduce.cc
index 3efac36c3f..d8818393ec 100644
--- a/lib/task-spec/src/task-spec/ops/reduce.cc
+++ b/lib/task-spec/src/task-spec/ops/reduce.cc
@@ -1,5 +1,6 @@
 #include "task-spec/ops/reduce.h"
 #include "kernels/reduce_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
 #include "utils/type_traits_core.h"
@@ -15,24 +16,31 @@ enum Slots {
   PROFILING,
   REDUCE,
   PER_DEVICE_STATE,
-  HANDLE
+  HANDLE,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation init(ReduceAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(HANDLE, ff_handle());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REDUCE_INIT_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_INIT_TASK_ID,
+      binding,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto attrs = acc.get_argument<ReduceAttrs>(ATTRS);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
@@ -40,15 +48,20 @@ static DeviceSpecificDeviceStates
   OperatorType op_type = attrs.op_type;
   nonnegative_int reduction_size =
-      input.shape.num_elements() / output.shape.num_elements();
-  ReducePerDeviceState per_device_state =
-      init_kernel(handle,
+      get_num_elements(input.shape.dims) / get_num_elements(output.shape.dims);
+
+  std::optional<ReducePerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
                   op_type,
                   reduction_size.unwrap_nonnegative(),
                   input.shape,
                   output.shape);
+
   return DeviceSpecificDeviceStates{
-      DeviceSpecific<ReducePerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<ReducePerDeviceState>>::create(
+          per_device_state),
+  };
 }

 // Note: forward_kernel only needs ReducePerDeviceState, input, output
@@ -56,25 +69,32 @@ OpTaskInvocation forward(ReduceAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<ReducePerDeviceState>());
+                   per_device_op_state<std::optional<ReducePerDeviceState>>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REDUCE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_FWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reduce] forward_time = {:.2lf}ms\n",
                  per_device_state,
                  input.get_float_ptr(),
@@ -84,7 +104,10 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 OpTaskInvocation backward(ReduceAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::REDUCE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REDUCE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float>
@@ -92,12 +115,15 @@ static std::optional<float>
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reduce] backward_time = {:.2lf}ms\n",
                  per_device_state,
                  output_grad.get_float_ptr(),
@@ -107,9 +133,11 @@ static std::optional<float>
 TaskImplFunction get_reduce_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_reduce_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_reduce_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -117,22 +145,26 @@ TaskImplFunction get_reduce_bwd_task_impl() {
 OpTaskSignature get_reduce_init_signature() {
   OpTaskSignature init(OpTaskType::INIT);

-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
   init.add_arg_slot<ReduceAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   init.add_return_value();

   return init;
 }
+
 OpTaskSignature get_reduce_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_reduce_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_reduce_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/reduction.cc b/lib/task-spec/src/task-spec/ops/reduction.cc
deleted file mode 100644
index 48f4c0e98d..0000000000
--- a/lib/task-spec/src/task-spec/ops/reduction.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/reduction.h"
-#include "kernels/reduction_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Reduction;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReductionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::REDUCTION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(ReductionAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REDUCTION_BWD_TASK_ID, binding};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling_settings =
-      acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-  auto attrs = acc.get_argument<ReductionAttrs>(ATTRS);
-
-  positive_int num_replicas = attrs.reduction_degree;
-
-  return profile(forward_kernel,
-                 profiling_settings,
-                 "[Reduction] forward_time = {:.2lf}ms\n",
-                 input,
-                 output,
-                 num_replicas.int_from_positive_int());
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  return profile(backward_kernel,
-                 profiling,
-                 "[Reduction] backward_time = {:.2lf}ms\n",
-                 output_grad,
-                 input_grad);
-}
-
-TaskImplFunction get_reduction_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_reduction_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_reduction_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_arg_slot<ReductionAttrs>(ATTRS);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  return fwd;
-}
-OpTaskSignature get_reduction_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_reduction_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(ReductionAttrs const &) {
-  return {task_id_t::REDUCTION_FWD_TASK_ID, task_id_t::REDUCTION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/repartition.cc b/lib/task-spec/src/task-spec/ops/repartition.cc
deleted file mode 100644
index cfc45dede7..0000000000
--- a/lib/task-spec/src/task-spec/ops/repartition.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/repartition.h"
-#include "kernels/partition_kernels.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Repartition;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, HANDLE, PER_DEVICE_STATE };
-
-OpTaskInvocation init(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(HANDLE, ff_handle());
-  binding.bind(INPUT, input_tensor(0));
-
-  return {task_id_t::REPARTITION_INIT_TASK_ID, binding};
-}
-
-OpTaskInvocation forward(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-  binding.bind_arg(ATTRS, attrs);
-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<RepartitionPerDeviceState>());
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-
-  return {task_id_t::REPARTITION_FWD_TASK_ID, binding};
-}
-
-OpTaskInvocation backward(RepartitionAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REPARTITION_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto input = acc.get_tensor(INPUT);
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-
-  // Note: use the input data type
-
-  RepartitionPerDeviceState per_device_state =
-      init_kernel(handle, input.data_type);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<RepartitionPerDeviceState>::create(per_device_state)};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  auto per_device_state =
-      acc.get_argument<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[Reparition/Partition] forward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 input,
-                 output);
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-  auto per_device_state =
-      acc.get_argument<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  auto output_grad = acc.get_tensor_grad(INPUT);
-  auto input_grad = acc.get_tensor_grad(OUTPUT);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[Reparition/Partition] backward_time = {:.2lf}ms\n",
-                 per_device_state,
-                 output_grad,
-                 input_grad);
-}
-
-TaskImplFunction get_repartition_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
-TaskImplFunction get_repartition_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_repartition_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_repartition_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
-  init.add_input_slot(INPUT);
-  init.add_return_value();
-  return init;
-}
-OpTaskSignature get_repartition_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot<RepartitionPerDeviceState>(PER_DEVICE_STATE);
-  return fwd;
-}
-OpTaskSignature get_repartition_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_repartition_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(RepartitionAttrs const &) {
-  return {task_id_t::REPARTITION_INIT_TASK_ID,
-          task_id_t::REPARTITION_FWD_TASK_ID,
-          task_id_t::REPARTITION_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/replicate.cc b/lib/task-spec/src/task-spec/ops/replicate.cc
deleted file mode 100644
index e91414bc16..0000000000
--- a/lib/task-spec/src/task-spec/ops/replicate.cc
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical)
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "task-spec/ops/replicate.h"
-#include "kernels/replicate_kernels.h"
-#include "op-attrs/parallel_tensor_shape.h"
-#include "utils/exception.h"
-#include "utils/hash-utils.h"
-
-namespace FlexFlow {
-
-using namespace FlexFlow::Kernels::Replicate;
-
-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
-
-OpTaskInvocation forward(ReplicateAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(PROFILING, profiling_settings());
-
-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::REPLICATE_FWD_TASK_ID, binding};
-}
-OpTaskInvocation backward(ReplicateAttrs const &attrs) {
-  OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);
-
-  return {task_id_t::REPLICATE_BWD_TASK_ID, binding};
-}
-
-static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input = acc.get_tensor(INPUT);
-  auto output = acc.get_tensor(OUTPUT);
-
-  return profile(forward_kernel,
-                 profiling,
-                 "[replicate] forward_time = {:.2lf}ms\n",
-                 input,
-                 output);
-}
-
-static std::optional<float>
-    backward_task_impl(TaskArgumentAccessor const &acc) {
-  ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
-
-  auto input_grad = acc.get_tensor_grad(INPUT);
-  auto output_grad = acc.get_tensor_grad(OUTPUT);
-  auto attrs = acc.get_argument<ReplicateAttrs>(ATTRS);
-
-  return profile(backward_kernel,
-                 profiling,
-                 "[replicate] backward_time = {:.2lf}ms\n",
-                 output_grad,
-                 input_grad,
-                 attrs.replicate_degree.int_from_positive_int());
-}
-
-TaskImplFunction get_replicate_fwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
-}
-TaskImplFunction get_replicate_bwd_task_impl() {
-  return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
-}
-
-OpTaskSignature get_replicate_fwd_signature() {
-  OpTaskSignature fwd(OpTaskType::FWD);
-
-  fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_input_slot(INPUT);
-  fwd.add_output_slot(OUTPUT);
-  return fwd;
-}
-
-OpTaskSignature get_replicate_bwd_signature() {
-  OpTaskSignature bwd = infer_bwd_signature(get_replicate_fwd_signature());
-  return bwd;
-}
-
-std::vector<task_id_t> get_task_ids(ReplicateAttrs const &) {
-  return {task_id_t::REPLICATE_FWD_TASK_ID, task_id_t::REPLICATE_BWD_TASK_ID};
-}
-
-}; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/reshape.cc b/lib/task-spec/src/task-spec/ops/reshape.cc
index 0b43f3e31f..b6d8cabd82 100644
--- a/lib/task-spec/src/task-spec/ops/reshape.cc
+++ b/lib/task-spec/src/task-spec/ops/reshape.cc
@@ -15,118 +15,98 @@

 #include "task-spec/ops/reshape.h"
 #include "kernels/reshape_kernels.h"
+#include "task-spec/profiling.h"

 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Reshape;

-enum slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE };
-
-OpTaskInvocation init(ReshapeAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::RESHAPE_INIT_TASK_ID, binding};
-}
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(ReshapeAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<ReshapePerDeviceState>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
+  binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  return {task_id_t::RESHAPE_FWD_TASK_ID, binding};
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
+  return OpTaskInvocation{
+      task_id_t::RESHAPE_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(ReshapeAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::RESHAPE_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-  auto attrs = acc.get_argument<ReshapeAttrs>(ATTRS);
-
-  ReshapePerDeviceState per_device_state = init_kernel(attrs.shape.data_type);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<ReshapePerDeviceState>::create(per_device_state)};
+  return OpTaskInvocation{
+      task_id_t::RESHAPE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument<ReshapePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
+  ReshapeAttrs attrs = acc.get_argument<ReshapeAttrs>(ATTRS);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reshape] forward time = {:.2lf}ms\n",
-                 per_device_state,
                  input,
                  output);
 }

 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
-  auto per_device_state =
-      acc.get_argument<ReshapePerDeviceState>(PER_DEVICE_STATE);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
+  ReshapeAttrs attrs = acc.get_argument<ReshapeAttrs>(ATTRS);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Reshape] backward time = {:.2lf}ms\n",
-                 per_device_state,
                  output_grad,
                  input_grad);
 }

-TaskImplFunction get_reshape_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
 TaskImplFunction get_reshape_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_reshape_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_reshape_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<ReshapeAttrs>(ATTRS);
-
-  init.add_return_value();
-  return init;
-}
 OpTaskSignature get_reshape_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
-  fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_reshape_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_reshape_fwd_signature());
   return bwd;
 }

 std::vector<task_id_t> get_task_ids(ReshapeAttrs const &) {
-  return {task_id_t::RESHAPE_INIT_TASK_ID,
-          task_id_t::RESHAPE_FWD_TASK_ID,
-          task_id_t::RESHAPE_BWD_TASK_ID};
+  return {task_id_t::RESHAPE_FWD_TASK_ID, task_id_t::RESHAPE_BWD_TASK_ID};
 }

 }; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/reverse.cc b/lib/task-spec/src/task-spec/ops/reverse.cc
index 41739d086e..9d1a8e1753 100644
--- a/lib/task-spec/src/task-spec/ops/reverse.cc
+++ b/lib/task-spec/src/task-spec/ops/reverse.cc
@@ -16,6 +16,7 @@
 #include "task-spec/ops/reverse.h"
 #include "kernels/accessor.h"
 #include "kernels/reverse_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/nonnegative_int/nonnegative_range.h"

 namespace FlexFlow {
@@ -23,33 +24,43 @@ namespace FlexFlow {
 using namespace FlexFlow::Kernels::Reverse;
 using coord_t = long long;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(ReverseAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   binding.bind_arg(ATTRS, attrs);

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::REVERSE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REVERSE_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(ReverseAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::REVERSE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::REVERSE_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument<ReverseAttrs>(ATTRS);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[reverse] forward_time = {:.2lf}ms\n",
                  input,
                  output,
@@ -59,12 +70,15 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument<ReverseAttrs>(ATTRS);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[reverse] backward_time = {:.2lf}ms\n",
                  output_grad,
                  input_grad,
@@ -82,6 +96,7 @@ OpTaskSignature get_reverse_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
diff --git a/lib/task-spec/src/task-spec/ops/softmax.cc b/lib/task-spec/src/task-spec/ops/softmax.cc
index 81239d1a67..89ea42299f 100644
--- a/lib/task-spec/src/task-spec/ops/softmax.cc
+++ b/lib/task-spec/src/task-spec/ops/softmax.cc
@@ -16,75 +16,104 @@
 #include "task-spec/ops/softmax.h"
 #include "kernels/softmax_kernels.h"
 #include "op-attrs/parallel_tensor_shape.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"

 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Softmax;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, PER_DEVICE_STATE, HANDLE };
+enum Slots {
+  INPUT,
+  OUTPUT,
+  ATTRS,
+  PROFILING,
+  PER_DEVICE_STATE,
+  HANDLE,
+  KERNEL_DEVICE_TYPE
+};

 OpTaskInvocation init(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(HANDLE, ff_handle());
   binding.bind_arg(ATTRS, attrs);

-  return {task_id_t::SOFTMAX_INIT_TASK_ID, binding};
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
+
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_INIT_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation forward(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PER_DEVICE_STATE,
-                   per_device_op_state<SoftmaxPerDeviceState>());
+                   per_device_op_state<std::optional<SoftmaxPerDeviceState>>());
   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::SOFTMAX_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(SoftmaxAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::SOFTMAX_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SOFTMAX_BWD_TASK_ID,
+      binding,
+  };
 }

 static DeviceSpecificDeviceStates
     init_task_impl(TaskArgumentAccessor const &acc) {
-  PerDeviceFFHandle handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
+  device_handle_t handle = acc.get_argument<device_handle_t>(HANDLE);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto output = acc.get_tensor(OUTPUT);
   auto const &attrs = acc.get_argument<SoftmaxAttrs>(ATTRS);

-  positive_int output_w = output.shape.at(legion_dim_t{0_n});
-  positive_int output_h = output.shape.at(legion_dim_t{1_n});
-  positive_int output_c = output.shape.at(legion_dim_t{2_n});
-  positive_int output_n = output.shape.at(legion_dim_t{3_n});
+  positive_int output_w = dim_at_idx(output.shape.dims, legion_dim_t{0_n});
+  positive_int output_h = dim_at_idx(output.shape.dims, legion_dim_t{1_n});
+  positive_int output_c = dim_at_idx(output.shape.dims, legion_dim_t{2_n});
+  positive_int output_n = dim_at_idx(output.shape.dims, legion_dim_t{3_n});

-  SoftmaxPerDeviceState per_device_state =
-      init_kernel(handle,
-                  attrs.dim.value.unwrap_nonnegative(),
+  std::optional<SoftmaxPerDeviceState> per_device_state =
+      init_kernel(kernel_device_type,
+                  handle,
+                  attrs.dim,
                   output_n.int_from_positive_int(),
                   output_c.int_from_positive_int(),
                   output_h.int_from_positive_int(),
                   output_w.int_from_positive_int());

   return DeviceSpecificDeviceStates{
-      DeviceSpecific<SoftmaxPerDeviceState>::create(per_device_state)};
+      DeviceSpecific<std::optional<SoftmaxPerDeviceState>>::create(
+          per_device_state),
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto per_device_state =
       acc.get_argument(PER_DEVICE_STATE);

   return profile(forward_kernel,
                  profiling,
-                 "[SoftMax] forward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Softmax] forward_time = {:.2lf}ms\n",
                  per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr());
@@ -93,6 +122,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto input = acc.get_tensor(INPUT);
@@ -103,20 +134,24 @@ static std::optional<float>

   assert(output_grad.shape == output.shape);

-  return profile(backward_kernel,
-                 profiling,
-                 "[SoftMax] backward_time = {:.2lf}ms\n",
-                 output_grad.get_float_ptr(),
-                 input_grad.get_float_ptr(),
-                 output_grad.shape.num_elements().int_from_positive_int());
+  return profile(
+      backward_kernel,
+      profiling,
+      kernel_device_type,
+      "[Softmax] backward_time = {:.2lf}ms\n",
+      output_grad.get_float_ptr(),
+      input_grad.get_float_ptr(),
+      get_num_elements(output_grad.shape.dims).int_from_positive_int());
 }

 TaskImplFunction get_softmax_init_task_impl() {
   return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
 }
+
 TaskImplFunction get_softmax_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_softmax_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }
@@ -124,21 +159,25 @@ TaskImplFunction get_softmax_bwd_task_impl() {
 OpTaskSignature get_softmax_init_signature() {
   OpTaskSignature init(OpTaskType::INIT);

-  init.add_unchecked_arg_slot<PerDeviceFFHandle>(HANDLE);
+  init.add_unchecked_arg_slot<device_handle_t>(HANDLE);
   init.add_arg_slot<SoftmaxAttrs>(ATTRS);
+  init.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   init.add_return_value();

   return init;
 }
+
 OpTaskSignature get_softmax_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);
   fwd.add_unchecked_arg_slot(PER_DEVICE_STATE);
   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   return fwd;
 }
+
 OpTaskSignature get_softmax_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_softmax_fwd_signature());
   return bwd;
diff --git a/lib/task-spec/src/task-spec/ops/split.cc b/lib/task-spec/src/task-spec/ops/split.cc
index 145a9b58a3..88c16be57c 100644
--- a/lib/task-spec/src/task-spec/ops/split.cc
+++ b/lib/task-spec/src/task-spec/ops/split.cc
@@ -14,8 +14,8 @@
  */

 #include "task-spec/ops/split.h"
-#include "kernels/array_shape.h"
 #include "kernels/split_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"
 #include "utils/hash-utils.h"
 #include "utils/nonnegative_int/nonnegative_range.h"
@@ -23,37 +23,45 @@
 namespace FlexFlow {

 using namespace FlexFlow::Kernels::Split;
-using coord_t = long long;

-enum Slots { INPUT, OUTPUT, ATTRS, PROFILING };
+enum Slots { INPUT, OUTPUT, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(SplitAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::SPLIT_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SPLIT_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(SplitAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::SPLIT_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::SPLIT_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::pair<positive_int, positive_int>
-    calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) {
+    calc_block_size(TensorShape const &tensor_shape, ff_dim_t axis) {
   positive_int num_blocks = 1_p;
   positive_int block_size = 1_p;
-  for (nonnegative_int d : nonnegative_range(
-           array_shape.num_elements().nonnegative_int_from_positive_int())) {
+  for (nonnegative_int d :
+       nonnegative_range(get_num_elements(tensor_shape.dims)
+                             .nonnegative_int_from_positive_int())) {
     if (d <= axis.value) {
-      block_size *= array_shape.at(legion_dim_t{d});
+      block_size *= dim_at_idx(tensor_shape.dims, legion_dim_t{d});
     } else {
-      num_blocks *= array_shape.at(legion_dim_t{d});
+      num_blocks *= dim_at_idx(tensor_shape.dims, legion_dim_t{d});
     }
   }
   return {num_blocks, block_size};
@@ -61,11 +69,13 @@ static std::pair<positive_int, positive_int>

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);
   auto attrs = acc.get_argument<SplitAttrs>(ATTRS);

-  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  int out_block_sizes[MAX_NUM_OUTPUTS];
   auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis);

   for (int i = 0; i < attrs.splits.size(); i++) {
@@ -75,7 +85,8 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   float *output_float_ptr = output.get_float_ptr();
   return profile(forward_kernel,
                  profiling,
-                 "Split forward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Split] forward_time = {:.2lf}ms\n",
                  &output_float_ptr,
                  input.get_float_ptr(),
                  out_block_sizes,
@@ -88,23 +99,26 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto attrs = acc.get_argument<SplitAttrs>(ATTRS);

-  coord_t out_block_sizes[MAX_NUM_OUTPUTS];
+  int out_block_sizes[MAX_NUM_OUTPUTS];
   auto [num_blocks, in_block_size] =
       calc_block_size(input_grad.shape, attrs.axis);

   for (int i = 0; i < attrs.splits.size(); i++) {
-    coord_t out_num_blocks;
+    int out_num_blocks;
     auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis);
     out_block_sizes[i] = out_block_size.int_from_positive_int();
   }
   float const *output_grad_ptr = output_grad.get_float_ptr();
   return profile(backward_kernel,
                  profiling,
-                 "Split backward_time = {:.2lf}ms\n",
+                 kernel_device_type,
+                 "[Split] backward_time = {:.2lf}ms\n",
                  input_grad.get_float_ptr(),
                  &output_grad_ptr,
                  out_block_sizes,
diff --git a/lib/task-spec/src/task-spec/ops/topk.cc b/lib/task-spec/src/task-spec/ops/topk.cc
index bdf92d8487..8ff275dac3 100644
--- a/lib/task-spec/src/task-spec/ops/topk.cc
+++ b/lib/task-spec/src/task-spec/ops/topk.cc
@@ -15,6 +15,7 @@

 #include "task-spec/ops/topk.h"
 #include "kernels/topk_kernels.h"
+#include "task-spec/profiling.h"
 #include "utils/exception.h"

 namespace FlexFlow {
@@ -25,63 +26,52 @@ using namespace FlexFlow::Kernels::TopK;
 // (resp. vector along the last dimension). Thus,
 // values.shape = indices.shape = input.shape[:-1] + [k]

-enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, PER_DEVICE_STATE };
-
-OpTaskInvocation init(TopKAttrs const &attrs) {
-  OpTaskBinding binding;
-
-  binding.bind_arg(ATTRS, attrs);
-
-  return {task_id_t::TOPK_INIT_TASK_ID, binding};
-}
+enum Slots { INPUT, OUTPUT, INDICES, ATTRS, PROFILING, KERNEL_DEVICE_TYPE };

 OpTaskInvocation forward(TopKAttrs const &attrs) {
   OpTaskBinding binding;

-  binding.bind_arg(PER_DEVICE_STATE, per_device_op_state<TopKPerDeviceState>());
   binding.bind_arg(PROFILING, profiling_settings());
   binding.bind_arg(ATTRS, attrs);
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
-  binding.bind(INDICES, output_tensor(1));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));
+  binding.bind(INDICES, output_tensor(1_n));

-  return {task_id_t::TOPK_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TOPK_FWD_TASK_ID,
+      binding,
+  };
 }

 OpTaskInvocation backward(TopKAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::TOPK_BWD_TASK_ID, binding};
-}
-
-static DeviceSpecificDeviceStates
-    init_task_impl(TaskArgumentAccessor const &acc) {
-
-  auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-
-  TopKPerDeviceState per_device_state = init_kernel(attrs.sorted);
-  return DeviceSpecificDeviceStates{
-      DeviceSpecific<TopKPerDeviceState>::create(per_device_state)};
+  return OpTaskInvocation{
+      task_id_t::TOPK_BWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-  auto per_device_state =
-      acc.get_argument<TopKPerDeviceState>(PER_DEVICE_STATE);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);
   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

-  positive_int length = input.shape.at(legion_dim_t{0_n});
-  positive_int batch_size = positive_int{input.shape.num_elements() / length};
+  positive_int length = dim_at_idx(input.shape.dims, legion_dim_t{0_n});
+  positive_int batch_size =
+      positive_int{get_num_elements(input.shape.dims) / length};
   auto indices = acc.get_tensor(INDICES);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[TopK] forward_time = {:.2lf}ms\n",
-                 per_device_state,
                  input.get_float_ptr(),
                  output.get_float_ptr(),
                  indices.get_int32_ptr(),
@@ -94,23 +84,23 @@ static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
 static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   auto attrs = acc.get_argument<TopKAttrs>(ATTRS);
-  auto per_device_state =
-      acc.get_argument<TopKPerDeviceState>(PER_DEVICE_STATE);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);
   auto indices = acc.get_tensor(INDICES);

-  positive_int length = input_grad.shape.at(legion_dim_t{0_n});
+  positive_int length = dim_at_idx(input_grad.shape.dims, legion_dim_t{0_n});
   positive_int batch_size =
-      positive_int{input_grad.shape.num_elements() / length};
+      positive_int{get_num_elements(input_grad.shape.dims) / length};

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[TopK] backward_time = {:.2lf}ms\n",
-                 per_device_state,
                  output_grad.get_float_ptr(),
                  indices.get_int32_ptr(),
                  input_grad.get_float_ptr(),
@@ -119,45 +109,34 @@ static std::optional<float>
                  attrs.k.int_from_positive_int());
 }

-TaskImplFunction get_topk_init_task_impl() {
-  return TaskImplFunction{InitOpTaskImplFunction{init_task_impl}};
-}
 TaskImplFunction get_topk_fwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{forward_task_impl}};
 }
+
 TaskImplFunction get_topk_bwd_task_impl() {
   return TaskImplFunction{FwdBwdOpTaskImplFunction{backward_task_impl}};
 }

-OpTaskSignature get_topk_init_signature() {
-  OpTaskSignature init(OpTaskType::INIT);
-
-  init.add_arg_slot<TopKAttrs>(ATTRS);
-  init.add_return_value();
-
-  return init;
-}
 OpTaskSignature get_topk_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
   fwd.add_arg_slot<TopKAttrs>(ATTRS);
-  fwd.add_unchecked_arg_slot<TopKPerDeviceState>(PER_DEVICE_STATE);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
   fwd.add_output_slot(INDICES);
   return fwd;
 }
+
 OpTaskSignature get_topk_bwd_signature() {
   OpTaskSignature bwd = infer_bwd_signature(get_topk_fwd_signature());
   return bwd;
 }

 std::vector<task_id_t> get_task_ids(TopKAttrs const &) {
-  return {task_id_t::TOPK_INIT_TASK_ID,
-          task_id_t::TOPK_FWD_TASK_ID,
-          task_id_t::TOPK_BWD_TASK_ID};
+  return {task_id_t::TOPK_FWD_TASK_ID, task_id_t::TOPK_BWD_TASK_ID};
 }

 }; // namespace FlexFlow
diff --git a/lib/task-spec/src/task-spec/ops/transpose.cc b/lib/task-spec/src/task-spec/ops/transpose.cc
index b6a69b0ed7..b2f94b6484 100644
--- a/lib/task-spec/src/task-spec/ops/transpose.cc
+++ b/lib/task-spec/src/task-spec/ops/transpose.cc
@@ -16,6 +16,7 @@
 #include "task-spec/ops/transpose.h"
 #include "kernels/transpose_kernels.h"
 #include "op-attrs/ops/transpose.h"
+#include "task-spec/profiling.h"
 #include "utils/integer_conversions.h"

 using namespace FlexFlow::Kernels::Transpose;
@@ -23,32 +24,40 @@ using namespace FlexFlow::Kernels::Transpose;
 namespace FlexFlow {

 enum Slots {
-  INPUT,  // tensor
-  OUTPUT, // tensor
+  INPUT,
+  OUTPUT,
   ATTRS,
   PROFILING,
+  KERNEL_DEVICE_TYPE,
 };

 OpTaskInvocation forward(TransposeAttrs const &attrs) {
   OpTaskBinding binding;

   binding.bind_arg(PROFILING, profiling_settings());
+  binding.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());

-  binding.bind(INPUT, input_tensor(0));
-  binding.bind(OUTPUT, output_tensor(0));
+  binding.bind(INPUT, input_tensor(0_n));
+  binding.bind(OUTPUT, output_tensor(0_n));

-  return {task_id_t::TRANSPOSE_FWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TRANSPOSE_FWD_TASK_ID,
+      binding,
+  };
 }

 static std::optional<float> forward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   auto attrs = acc.get_argument<TransposeAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input = acc.get_tensor(INPUT);
   auto output = acc.get_tensor(OUTPUT);

   return profile(forward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Transpose] Forward_time = {:.2lf} [ms]",
                  attrs,
                  input,
@@ -59,12 +68,15 @@ static std::optional<float>
     backward_task_impl(TaskArgumentAccessor const &acc) {
   ProfilingSettings profiling = acc.get_argument<ProfilingSettings>(PROFILING);
   auto attrs = acc.get_argument<TransposeAttrs>(ATTRS);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   auto input_grad = acc.get_tensor_grad(INPUT);
   auto output_grad = acc.get_tensor_grad(OUTPUT);

   return profile(backward_kernel,
                  profiling,
+                 kernel_device_type,
                  "[Transpose] Backward_time = {:.2lf} [ms]",
                  attrs,
                  output_grad,
@@ -74,7 +86,10 @@ static std::optional<float>
 OpTaskInvocation backward(TransposeAttrs const &attrs) {
   OpTaskBinding binding = infer_bwd_binding(forward(attrs).binding);

-  return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding};
+  return OpTaskInvocation{
+      task_id_t::TRANSPOSE_BWD_TASK_ID,
+      binding,
+  };
 }

 TaskImplFunction get_transpose_fwd_task_impl() {
@@ -89,6 +104,7 @@ OpTaskSignature get_transpose_fwd_signature() {
   OpTaskSignature fwd(OpTaskType::FWD);

   fwd.add_arg_slot<ProfilingSettings>(PROFILING);
+  fwd.add_arg_slot<DeviceType>(KERNEL_DEVICE_TYPE);

   fwd.add_input_slot(INPUT);
   fwd.add_output_slot(OUTPUT);
diff --git a/lib/local-execution/src/optimizer.cc b/lib/task-spec/src/task-spec/optimizer.cc
similarity index 76%
rename from lib/local-execution/src/optimizer.cc
rename to lib/task-spec/src/task-spec/optimizer.cc
index 1d65172e67..c8fa23c2af 100644
--- a/lib/local-execution/src/optimizer.cc
+++ b/lib/task-spec/src/task-spec/optimizer.cc
@@ -1,4 +1,4 @@
-#include "local-execution/optimizer.h"
+#include "task-spec/optimizer.h"
 #include "kernels/optimizer_kernels.h"
 #include "task-spec/profiling.h"
 #include "utils/containers/get_only.h"
@@ -14,7 +14,8 @@ enum Slots {
   PROFILING,
   ADAM_M,
   ADAM_V,
-  HANDLE
+  HANDLE,
+  KERNEL_DEVICE_TYPE,
 };

 TaskSignature get_sgd_update_signature() {
@@ -25,6 +26,7 @@ TaskSignature get_sgd_update_signature() {

   add_arg_slot<SGDOptimizerAttrs>(sig, ATTRS);
   add_arg_slot<ProfilingSettings>(sig, PROFILING);
+  add_arg_slot<DeviceType>(sig, KERNEL_DEVICE_TYPE);

   add_unchecked_arg_slot(
       sig, HANDLE); // how to deal with removal of ParamSync?
@@ -35,9 +37,9 @@ TaskSignature get_sgd_update_signature() {
 }

 TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
-                          tensor_guid_t const &weight,
-                          gradient_tensor_t const &weight_grad,
-                          optimizer_tensor_t const &sgd_v) {
+                          forward_tensor_guid_t const &weight,
+                          gradient_tensor_guid_t const &weight_grad,
+                          optimizer_tensor_guid_t const &sgd_v) {
   TaskBinding b;
   b.bind(WEIGHT, weight);
   b.bind_grad(WEIGHT_GRAD, weight_grad);
@@ -47,6 +49,7 @@ TaskInvocation sgd_update(SGDOptimizerAttrs const &attrs,
   }
   b.bind_arg(ATTRS, attrs);
   b.bind_arg(PROFILING, profiling_settings());
+  b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type());
   b.bind_arg(HANDLE, ff_handle());

   return TaskInvocation{task_id_t::SGD_UPD_NCCL_TASK_ID,
@@ -65,35 +68,38 @@ static void sgd_update_task_impl(TaskArgumentAccessor const &acc) {
   auto weight_grad = acc.get_tensor_grad(WEIGHT_GRAD);
   auto weight = acc.get_tensor(WEIGHT);
   auto profiling = acc.get_argument<ProfilingSettings>(PROFILING);
+  DeviceType kernel_device_type =
+      acc.get_argument<DeviceType>(KERNEL_DEVICE_TYPE);

   ASSERT(weight.shape == weight_grad.shape);

-  int size = weight_grad.shape.num_elements().int_from_positive_int();
-  ASSERT(weight_grad.shape.num_elements().int_from_positive_int() &
-         weight.shape.num_elements().int_from_positive_int());
-  int num_replicas = weight_grad.shape.num_elements().int_from_positive_int() /
-                     weight.shape.num_elements().int_from_positive_int();
+  ASSERT(get_num_elements(weight_grad.shape.dims).int_from_positive_int() %
+             get_num_elements(weight.shape.dims).int_from_positive_int() ==
+         0);
+  int num_replicas =
+      get_num_elements(weight_grad.shape.dims).int_from_positive_int() /
+      get_num_elements(weight.shape.dims).int_from_positive_int();

-  float *sgd_v_ptr;
+  std::optional<GenericTensorAccessorW> sgd_v = std::nullopt;
   if (attrs.momentum > 0.0f) {
-    auto sgd_v = acc.get_optimizer_tensor(SGD_V);
-    ASSERT(sgd_v.shape == weight.shape);
-    sgd_v_ptr = sgd_v.get_float_ptr();
+    sgd_v = acc.get_optimizer_tensor(SGD_V);
+    ASSERT(sgd_v.value().shape == weight.shape);
   }

-  auto handle = acc.get_argument<PerDeviceFFHandle>(HANDLE);
-  profile(sgd_nccl_update_task_gpu,
+  auto handle = acc.get_argument<device_handle_t>(HANDLE);
+  profile(sgd_update_task,
           profiling,
-          "[SGD NCCL] update_time = %.2lfms\n",
+          kernel_device_type,
"[SGD] update_time = %.2lfms\n", + handle, attrs.lr, attrs.momentum, attrs.nesterov, attrs.weight_decay, - handle, - weight_grad.get_float_ptr(), - size, - weight.get_float_ptr(), - sgd_v_ptr); // how to deal with removal of ParamSync? + weight_grad, + num_replicas, + weight, + sgd_v); // how to deal with removal of ParamSync? // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { // auto handle = acc.get_argument(HANDLE); @@ -139,6 +145,7 @@ TaskSignature get_adam_update_signature() { add_arg_slot(sig, ATTRS); add_arg_slot(sig, PROFILING); + add_arg_slot(sig, KERNEL_DEVICE_TYPE); add_unchecked_arg_slot( sig, HANDLE); // how to deal with removal of ParamSync? // if (CHOSEN_SYNC_TYPE == ParamSync::NCCL) { @@ -148,10 +155,10 @@ TaskSignature get_adam_update_signature() { } TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - optimizer_tensor_t const &adam_v, - optimizer_tensor_t const &adam_m) { + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + optimizer_tensor_guid_t const &adam_v, + optimizer_tensor_guid_t const &adam_m) { TaskBinding b; b.bind(WEIGHT, weight); b.bind_grad(WEIGHT_GRAD, weight_grad); @@ -159,6 +166,7 @@ TaskInvocation adam_update(AdamOptimizerAttrs const &attrs, b.bind_optimizer(ADAM_V, adam_v); b.bind_arg(ATTRS, attrs); b.bind_arg(PROFILING, profiling_settings()); + b.bind_arg(KERNEL_DEVICE_TYPE, kernel_device_type()); b.bind_arg(HANDLE, ff_handle()); return TaskInvocation{task_id_t::ADAM_UPD_NCCL_TASK_ID, b}; // how to deal with removal of ParamSync? @@ -179,24 +187,33 @@ static void adam_update_task_impl(TaskArgumentAccessor const &acc) { auto m_tensor = acc.get_optimizer_tensor(ADAM_M); auto profiling = acc.get_argument(PROFILING); + DeviceType kernel_device_type = + acc.get_argument(KERNEL_DEVICE_TYPE); ASSERT(weight.shape == weight_grad.shape); - int size = weight_grad.shape.num_elements().int_from_positive_int(); + int size = get_num_elements(weight_grad.shape.dims).int_from_positive_int(); - ASSERT(weight_grad.shape.num_elements() % weight.shape.num_elements() == 0); + ASSERT(get_num_elements(weight_grad.shape.dims).int_from_positive_int() % + get_num_elements(weight.shape.dims).int_from_positive_int() == + 0); + int num_replicas = + get_num_elements(weight_grad.shape.dims).int_from_positive_int() / + get_num_elements(weight.shape.dims).int_from_positive_int(); - auto handle = acc.get_argument(HANDLE); - profile(adam_nccl_update_task_gpu, + auto handle = acc.get_argument(HANDLE); + profile(adam_update_task, profiling, + kernel_device_type, "[Adam NCCL] update_time = %.2lfms\n", + handle, attrs.alpha_t, attrs.beta1, attrs.beta2, attrs.weight_decay, attrs.epsilon, - handle, weight_grad.get_float_ptr(), size, + num_replicas, m_tensor.get_float_ptr(), v_tensor.get_float_ptr(), weight.get_float_ptr()); // how to deal with removal of ParamSync? 
@@ -247,9 +264,9 @@ TaskSignature get_update_signature(OptimizerAttrs const &attrs) { TaskInvocation get_update_invocation( OptimizerAttrs const &attrs, - tensor_guid_t const &weight, - gradient_tensor_t const &weight_grad, - std::vector const &grad_buffer_tensors) { + forward_tensor_guid_t const &weight, + gradient_tensor_guid_t const &weight_grad, + std::vector const &grad_buffer_tensors) { return attrs.visit( overload{[&](SGDOptimizerAttrs const &s) { return sgd_update( diff --git a/lib/local-execution/src/optimizer_tensor_source.cc b/lib/task-spec/src/task-spec/optimizer_tensor_source.cc similarity index 55% rename from lib/local-execution/src/optimizer_tensor_source.cc rename to lib/task-spec/src/task-spec/optimizer_tensor_source.cc index a1a9a2927d..ad7bf9f489 100644 --- a/lib/local-execution/src/optimizer_tensor_source.cc +++ b/lib/task-spec/src/task-spec/optimizer_tensor_source.cc @@ -1,13 +1,13 @@ -#include "local-execution/optimizer_tensor_source.h" +#include "task-spec/optimizer_tensor_source.h" namespace FlexFlow { -size_t OptimizerTensorSource::next_available_optimizer_tensor_id = 0; +int OptimizerTensorSource::next_available_optimizer_tensor_id = 0; OptimizerTensorSource::OptimizerTensorSource() {} -optimizer_tensor_t OptimizerTensorSource::new_optimizer_tensor() { - return optimizer_tensor_t{ +optimizer_tensor_guid_t OptimizerTensorSource::new_optimizer_tensor() { + return optimizer_tensor_guid_t{ OptimizerTensorSource::next_available_optimizer_tensor_id++}; } diff --git a/lib/task-spec/src/task-spec/profiling.cc b/lib/task-spec/src/task-spec/profiling.cc new file mode 100644 index 0000000000..e32a2e564c --- /dev/null +++ b/lib/task-spec/src/task-spec/profiling.cc @@ -0,0 +1 @@ +#include "task-spec/profiling.h" diff --git a/lib/task-spec/src/task-spec/runtime_arg_config.cc b/lib/task-spec/src/task-spec/runtime_arg_config.cc new file mode 100644 index 0000000000..9f3dc61545 --- /dev/null +++ b/lib/task-spec/src/task-spec/runtime_arg_config.cc @@ -0,0 +1,30 @@ +#include "task-spec/runtime_arg_config.h" +#include "kernels/device_handle_t.h" + +namespace FlexFlow { + +RuntimeArgConfig + cpu_make_runtime_arg_config(EnableProfiling enable_profiling, + ProfilingSettings profiling_settings) { + return RuntimeArgConfig{ + DeviceSpecific::create(cpu_make_device_handle_t()), + enable_profiling, + profiling_settings, + DeviceType::CPU, + }; +} + +RuntimeArgConfig + gpu_make_runtime_arg_config(PerDeviceFFHandle const &ff_handle, + EnableProfiling enable_profiling, + ProfilingSettings profiling_settings) { + return RuntimeArgConfig{ + DeviceSpecific::create( + gpu_make_device_handle_t(ff_handle)), + enable_profiling, + profiling_settings, + DeviceType::GPU, + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/runtime_arg_ref.cc b/lib/task-spec/src/task-spec/runtime_arg_ref.cc index bb4625c113..3aa1b7f907 100644 --- a/lib/task-spec/src/task-spec/runtime_arg_ref.cc +++ b/lib/task-spec/src/task-spec/runtime_arg_ref.cc @@ -1,26 +1,14 @@ #include "task-spec/runtime_arg_ref.h" +#include "kernels/device_handle_t.dtg.h" #include "task-spec/device_specific.h" namespace FlexFlow { -std::string to_string(RuntimeArgRefType const &runtime_arg_ref_type) { - switch (runtime_arg_ref_type) { - case RuntimeArgRefType::FF_HANDLE: - return "FF_HANDLE"; - case RuntimeArgRefType::PROFILING_SETTINGS: - return "PROFILING_SETTINGS"; - case RuntimeArgRefType::FF_ITERATION_CONFIG: - return "FF_ITERATION_CONFIG"; - default: - return "Unknown"; - } -} - RuntimeArgRef profiling_settings() { 
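  // (Gloss: a RuntimeArgRef carries only its RuntimeArgRefType tag; the
  // concrete ProfilingSettings / device handle / DeviceType value appears
  // to be substituted from the RuntimeArgConfig when the bound task runs,
  // which is why bind_arg can accept these placeholders.)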
return {RuntimeArgRefType::PROFILING_SETTINGS}; } -RuntimeArgRef> ff_handle() { +RuntimeArgRef> ff_handle() { return {RuntimeArgRefType::FF_HANDLE}; } @@ -28,4 +16,8 @@ RuntimeArgRef iteration_config() { return {RuntimeArgRefType::FF_ITERATION_CONFIG}; } +RuntimeArgRef kernel_device_type() { + return {RuntimeArgRefType::KERNEL_DEVICE_TYPE}; +} + } // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/task_invocation.cc b/lib/task-spec/src/task-spec/task_invocation.cc index e182231bda..0677ff6e60 100644 --- a/lib/task-spec/src/task-spec/task_invocation.cc +++ b/lib/task-spec/src/task-spec/task_invocation.cc @@ -7,7 +7,6 @@ namespace FlexFlow { bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { TaskBinding binding = inv.binding; - // args for (std::pair const &arg_binding : binding.get_arg_bindings()) { if (sig.task_arg_types.count(arg_binding.first)) { @@ -20,9 +19,8 @@ bool is_invocation_valid(TaskSignature const &sig, TaskInvocation const &inv) { } } - // tensors - for (std::pair const &tensor_binding : - binding.get_tensor_bindings()) { + for (std::pair const + &tensor_binding : binding.get_tensor_bindings()) { slot_id_t tensor_slot_id = tensor_binding.first.slot_id; if (sig.tensor_guid_slots.count(tensor_slot_id)) { if (tensor_binding.first.tensor_type == diff --git a/lib/task-spec/src/task-spec/task_signature_impl.cc b/lib/task-spec/src/task-spec/task_signature_impl.cc index 7995c0af0b..8da38b5840 100644 --- a/lib/task-spec/src/task-spec/task_signature_impl.cc +++ b/lib/task-spec/src/task-spec/task_signature_impl.cc @@ -3,7 +3,6 @@ #include "task-spec/ops/batch_matmul.h" #include "task-spec/ops/batch_norm.h" #include "task-spec/ops/cast.h" -#include "task-spec/ops/combine.h" #include "task-spec/ops/concat.h" #include "task-spec/ops/conv_2d.h" #include "task-spec/ops/dropout.h" @@ -18,9 +17,6 @@ #include "task-spec/ops/noop.h" #include "task-spec/ops/pool_2d.h" #include "task-spec/ops/reduce.h" -#include "task-spec/ops/reduction.h" -#include "task-spec/ops/repartition.h" -#include "task-spec/ops/replicate.h" #include "task-spec/ops/reshape.h" #include "task-spec/ops/reverse.h" #include "task-spec/ops/softmax.h" @@ -32,7 +28,8 @@ namespace FlexFlow { -TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { +TaskSignatureAndImpl + get_task_signature_and_impl_for_task_id(task_id_t const &task_id) { switch (task_id) { case task_id_t::ELEMENTBINARY_INIT_TASK_ID: return TaskSignatureAndImpl{get_element_binary_init_task_impl(), @@ -70,12 +67,12 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::DROPOUT_BWD_TASK_ID: return TaskSignatureAndImpl{get_dropout_bwd_task_impl(), get_dropout_bwd_signature()}; - // case task_id_t::EMBED_FWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), - // get_embedding_fwd_signature()}; - // case task_id_t::EMBED_BWD_TASK_ID: - // return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), - // get_embedding_bwd_signature()}; + case task_id_t::EMBED_FWD_TASK_ID: + return TaskSignatureAndImpl{get_embedding_fwd_task_impl(), + get_embedding_fwd_signature()}; + case task_id_t::EMBED_BWD_TASK_ID: + return TaskSignatureAndImpl{get_embedding_bwd_task_impl(), + get_embedding_bwd_signature()}; case task_id_t::GATHER_INIT_TASK_ID: return TaskSignatureAndImpl{get_gather_init_task_impl(), get_gather_init_signature()}; @@ -169,9 +166,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::REDUCE_BWD_TASK_ID: return 
TaskSignatureAndImpl{get_reduce_bwd_task_impl(), get_reduce_bwd_signature()}; - case task_id_t::RESHAPE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_reshape_init_task_impl(), - get_reshape_init_signature()}; case task_id_t::RESHAPE_FWD_TASK_ID: return TaskSignatureAndImpl{get_reshape_fwd_task_impl(), get_reshape_fwd_signature()}; @@ -184,9 +178,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::REVERSE_BWD_TASK_ID: return TaskSignatureAndImpl{get_reverse_bwd_task_impl(), get_reverse_bwd_signature()}; - case task_id_t::TOPK_INIT_TASK_ID: - return TaskSignatureAndImpl{get_topk_init_task_impl(), - get_topk_init_signature()}; case task_id_t::TOPK_FWD_TASK_ID: return TaskSignatureAndImpl{get_topk_fwd_task_impl(), get_topk_fwd_signature()}; @@ -208,37 +199,8 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::ATTENTION_BWD_TASK_ID: return TaskSignatureAndImpl{get_attention_bwd_task_impl(), get_attention_bwd_signature()}; - case task_id_t::COMBINE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_fwd_task_impl(), - get_combine_fwd_signature()}; - case task_id_t::COMBINE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_combine_bwd_task_impl(), - get_combine_bwd_signature()}; - case task_id_t::REDUCTION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_fwd_task_impl(), - get_reduction_fwd_signature()}; - case task_id_t::REDUCTION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_reduction_bwd_task_impl(), - get_reduction_bwd_signature()}; - case task_id_t::REPARTITION_INIT_TASK_ID: - return TaskSignatureAndImpl{get_repartition_init_task_impl(), - get_repartition_init_signature()}; - case task_id_t::REPARTITION_FWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_fwd_task_impl(), - get_repartition_fwd_signature()}; - case task_id_t::REPARTITION_BWD_TASK_ID: - return TaskSignatureAndImpl{get_repartition_bwd_task_impl(), - get_repartition_bwd_signature()}; - case task_id_t::REPLICATE_FWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_fwd_task_impl(), - get_replicate_fwd_signature()}; - case task_id_t::REPLICATE_BWD_TASK_ID: - return TaskSignatureAndImpl{get_replicate_bwd_task_impl(), - get_replicate_bwd_signature()}; default: - throw mk_runtime_error( - fmt::format("Invalid task ID")); // inserting task_id yields - // "type_is_unformattable" error + PANIC("Unhandled task ID", task_id); } } @@ -252,9 +214,7 @@ std::vector get_task_ids(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return get_task_ids(attrs); }, [](ElementBinaryAttrs const &attrs) { return get_task_ids(attrs); }, [](ElementUnaryAttrs const &attrs) { return get_task_ids(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return get_task_ids(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return get_task_ids(attrs); }, [](FlatAttrs const &attrs) { return get_task_ids(attrs); }, [](GatherAttrs const &attrs) { return get_task_ids(attrs); }, [](InputAttrs const &attrs) { return get_task_ids(attrs); }, @@ -277,7 +237,8 @@ std::vector get_task_ids(ComputationGraphOpAttrs const &op) { }); } -OpTaskInvocation init(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_init_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchNormAttrs const &attrs) { return init(attrs); }, [](Conv2DAttrs const &attrs) { return init(attrs); }, @@ -290,16 +251,15 @@ OpTaskInvocation init(ComputationGraphOpAttrs const &op) { [](MultiHeadAttentionAttrs const &attrs) { return init(attrs); }, 
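      // (Only ops that still carry per-device state appear in this init
      // dispatch; Reshape and TopK drop out now that their INIT task IDs
      // are removed above, and any other attr type falls through to the
      // PANIC overload at the end of the visit.)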
[](Pool2DAttrs const &attrs) { return init(attrs); }, [](ReduceAttrs const &attrs) { return init(attrs); }, - [](ReshapeAttrs const &attrs) { return init(attrs); }, [](SoftmaxAttrs const &attrs) { return init(attrs); }, - [](TopKAttrs const &attrs) { return init(attrs); }, [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); + PANIC("Unhandled attr type", attrs); }, }); } -OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_forward_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchMatmulAttrs const &attrs) { return forward(attrs); }, [](BatchNormAttrs const &attrs) { return forward(attrs); }, @@ -309,9 +269,7 @@ OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return forward(attrs); }, [](ElementBinaryAttrs const &attrs) { return forward(attrs); }, [](ElementUnaryAttrs const &attrs) { return forward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return forward(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return forward(attrs); }, [](FlatAttrs const &attrs) { return forward(attrs); }, [](GatherAttrs const &attrs) { return forward(attrs); }, [](LayerNormAttrs const &attrs) { return forward(attrs); }, @@ -331,7 +289,8 @@ OpTaskInvocation forward(ComputationGraphOpAttrs const &op) { }); } -OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { +OpTaskInvocation + get_backward_op_task_invocation(ComputationGraphOpAttrs const &op) { return op.visit(overload{ [](BatchMatmulAttrs const &attrs) { return backward(attrs); }, [](BatchNormAttrs const &attrs) { return backward(attrs); }, @@ -341,9 +300,7 @@ OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { [](DropoutAttrs const &attrs) { return backward(attrs); }, [](ElementBinaryAttrs const &attrs) { return backward(attrs); }, [](ElementUnaryAttrs const &attrs) { return backward(attrs); }, - // [](EmbeddingAttrs const & attrs) { - // return backward(attrs); - // }, + [](EmbeddingAttrs const &attrs) { return backward(attrs); }, [](FlatAttrs const &attrs) { return backward(attrs); }, [](GatherAttrs const &attrs) { return backward(attrs); }, [](LayerNormAttrs const &attrs) { return backward(attrs); }, @@ -358,7 +315,7 @@ OpTaskInvocation backward(ComputationGraphOpAttrs const &op) { [](TopKAttrs const &attrs) { return backward(attrs); }, [](TransposeAttrs const &attrs) { return backward(attrs); }, [](auto const &attrs) -> OpTaskInvocation { - throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); + PANIC("Unhandled attr type", attrs); }, }); } diff --git a/lib/task-spec/src/task-spec/training_computation_graph.cc b/lib/task-spec/src/task-spec/training_computation_graph.cc new file mode 100644 index 0000000000..f50930d684 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_computation_graph.cc @@ -0,0 +1,183 @@ +#include "task-spec/training_computation_graph.h" +#include "task-spec/loss_tensor_source.h" +#include "task-spec/training_tensor_group.h" +#include "task-spec/training_tensor_group_with_attrs.h" +#include "utils/containers/contains.h" +#include "utils/containers/filter_values.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/get_only.h" +#include "utils/containers/keys.h" +#include "utils/containers/set_of.h" +#include "utils/containers/transform.h" +#include "utils/overload.h" + +namespace FlexFlow { + +TrainingComputationGraph 
generate_training_computation_graph( + ComputationGraph const &computation_graph, + OptimizerAttrs const &optimizer_attrs, + tensor_guid_t const &logit_tensor, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source, + LossTensorSource &loss_tensor_source) { + + loss_tensor_guid_t label_tensor = loss_tensor_source.new_loss_tensor(); + + return TrainingComputationGraph{ + /*computation_graph=*/computation_graph, + /*training_tensor_group_for_tensor=*/ + transform( + get_all_tensor_attrs(computation_graph), + [&](tensor_guid_t tensor_guid, TensorAttrs const &tensor_attrs) { + return std::pair{ + tensor_guid, + make_training_tensor_group_for_tensor_guid_t( + /*tensor_guid=*/tensor_guid, + /*tensor_attrs=*/tensor_attrs, + /*optimizer_attrs=*/optimizer_attrs, + /*forward_tensor_source=*/forward_tensor_source, + /*gradient_tensor_source=*/gradient_tensor_source, + /*optimizer_tensor_source=*/optimizer_tensor_source), + }; + }), + /*logit_tensor=*/logit_tensor, + /*label_tensor=*/label_tensor, + }; +} + +TrainingTensorGroup get_training_tensor_group_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t tensor_guid) { + + return training_cg.training_tensor_group_for_tensor.at(tensor_guid); +} + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_tensor_guid( + TrainingComputationGraph const &training_cg, + tensor_guid_t tensor_guid) { + return make_training_tensor_group_with_attrs_from_group_and_attrs( + /*group=*/get_training_tensor_group_for_tensor_guid(training_cg, + tensor_guid), + /*attrs=*/get_tensor_attrs(training_cg.computation_graph, tensor_guid)); +} + +forward_tensor_guid_t get_forward_tensor_guid_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).forward_tensor; +} + +gradient_tensor_guid_t get_gradient_tensor_guid_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).gradient_tensor; +} + +std::vector get_optimizer_tensor_guids_for_tensor_guid( + TrainingComputationGraph const &training_cg, tensor_guid_t t) { + return training_cg.training_tensor_group_for_tensor.at(t).optimizer_tensors; +} + +tensor_guid_t get_tensor_guid_for_forward_tensor_guid( + TrainingComputationGraph const &training_cg, forward_tensor_guid_t t) { + return get_only(keys(filter_values( + training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { return g.forward_tensor == t; }))); +} + +tensor_guid_t get_tensor_guid_for_gradient_tensor_guid( + TrainingComputationGraph const &training_cg, gradient_tensor_guid_t t) { + return get_only(keys(filter_values( + training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { return g.gradient_tensor == t; }))); +} + +tensor_guid_t get_tensor_guid_for_optimizer_tensor_guid( + TrainingComputationGraph const &training_cg, optimizer_tensor_guid_t t) { + return get_only( + keys(filter_values(training_cg.training_tensor_group_for_tensor, + [&](TrainingTensorGroup const &g) { + return contains(g.optimizer_tensors, t); + }))); +} + +tensor_guid_t get_tensor_guid_for_training_tensor_guid( + TrainingComputationGraph const &training_cg, training_tensor_guid_t t) { + return t.visit(overload{ + [&](forward_tensor_guid_t forward_tensor) { + return get_tensor_guid_for_forward_tensor_guid(training_cg, + forward_tensor); + }, + 
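      // (Each case in this visit performs the same reverse lookup as the
      // helpers above: filter training_tensor_group_for_tensor down to the
      // unique group owning the given training tensor, with get_only
      // asserting that exactly one group matches.)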
[&](gradient_tensor_guid_t gradient_tensor) { + return get_tensor_guid_for_gradient_tensor_guid(training_cg, + gradient_tensor); + }, + [&](optimizer_tensor_guid_t optimizer_tensor) { + return get_tensor_guid_for_optimizer_tensor_guid(training_cg, + optimizer_tensor); + }, + [&](loss_tensor_guid_t loss_tensor) -> tensor_guid_t { + PANIC("no tensor_guid_t can exist for a loss_tensor_guid_t"); + }, + }); +} + +std::unordered_set + get_all_training_tensors_in_training_computation_graph( + TrainingComputationGraph const &training_cg) { + std::unordered_set result = flatmap( + unordered_set_of(keys(training_cg.training_tensor_group_for_tensor)), + [&](tensor_guid_t t) { + return get_all_training_tensors_in_tensor_group( + training_cg.training_tensor_group_for_tensor.at(t)); + }); + + result.insert(training_tensor_guid_t{training_cg.label_tensor}); + return result; +} + +TrainingLayerPlusContext + get_training_layer_plus_context(TrainingComputationGraph const &training_cg, + layer_guid_t layer_guid) { + auto get_tensor_group_with_attrs = + [&](tensor_guid_t t) -> TrainingTensorGroupWithAttrs { + return get_training_tensor_group_with_attrs_for_tensor_guid(training_cg, t); + }; + + return TrainingLayerPlusContext{ + /*layer_guid=*/layer_guid, + /*layer_attrs=*/ + get_layer_attrs(training_cg.computation_graph, layer_guid), + /*input_tensor_groups=*/ + transform(get_incoming_inputs(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + /*weight_tensor_groups=*/ + transform(get_incoming_weights(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + /*output_tensor_groups=*/ + transform(get_outgoing_tensors(training_cg.computation_graph, layer_guid), + get_tensor_group_with_attrs), + }; +} + +std::unordered_map + get_all_training_tensor_shapes( + TrainingComputationGraph const &training_cg) { + return generate_map( + get_all_training_tensors_in_training_computation_graph(training_cg), + [&](training_tensor_guid_t t) { + if (t.is_loss_tensor()) { + ASSERT(t == training_tensor_guid_t{training_cg.label_tensor}); + return get_tensor_attrs(training_cg.computation_graph, + training_cg.logit_tensor) + .shape; + } + + return get_tensor_attrs( + training_cg.computation_graph, + get_tensor_guid_for_training_tensor_guid(training_cg, t)) + .shape; + }); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_layer_plus_context.cc b/lib/task-spec/src/task-spec/training_layer_plus_context.cc new file mode 100644 index 0000000000..9adbc6b2a1 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_layer_plus_context.cc @@ -0,0 +1,122 @@ +#include "task-spec/training_layer_plus_context.h" +#include "task-spec/training_tensor_group_with_attrs.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +std::vector + get_training_tensor_groups_with_attrs_for_role( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role) { + + switch (tensor_role) { + case TensorRole::INPUT: + return training_layer_plus_context.input_tensor_groups; + case TensorRole::WEIGHT: + return training_layer_plus_context.weight_tensor_groups; + case TensorRole::OUTPUT: + return training_layer_plus_context.output_tensor_groups; + default: + PANIC("Unhandled TensorRole {}", tensor_role); + } +} + +TrainingTensorGroupWithAttrs + get_training_tensor_group_with_attrs_for_role_and_index( + TrainingLayerPlusContext const &training_layer_plus_context, + TensorRole tensor_role, + nonnegative_int index) { + + return 
get_training_tensor_groups_with_attrs_for_role( + training_layer_plus_context, tensor_role) + .at(index.unwrap_nonnegative()); +} + +std::vector + get_input_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_input_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_input_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.input_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +std::vector + get_weight_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_weight_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_weight_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.weight_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +std::vector + get_output_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.forward_tensor; }); +} + +std::vector + get_output_grad_tensors(TrainingLayerPlusContext const &l) { + return transform( + l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { return g.gradient_tensor; }); +} + +std::vector + get_output_tensor_shapes(TrainingLayerPlusContext const &l) { + return transform(l.output_tensor_groups, + [](TrainingTensorGroupWithAttrs const &g) { + return g.tensor_attrs.shape; + }); +} + +TrainingLayerTensorGroupSignature + get_tensor_group_signature(TrainingLayerPlusContext const &l) { + return TrainingLayerTensorGroupSignature{ + /*input_tensor_groups=*/transform(l.input_tensor_groups, + tensor_group_without_attrs), + /*weight_tensor_groups=*/ + transform(l.weight_tensor_groups, tensor_group_without_attrs), + /*output_tensor_groups=*/ + transform(l.output_tensor_groups, tensor_group_without_attrs), + }; +} + +CGOperatorTensorShapeSignature + get_cg_op_shape_signature(TrainingLayerPlusContext const &l) { + return CGOperatorTensorShapeSignature{ + /*input_shapes=*/get_input_tensor_shapes(l), + /*weight_shapes=*/get_weight_tensor_shapes(l), + /*output_shapes=*/get_output_tensor_shapes(l), + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc b/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc new file mode 100644 index 0000000000..db8b8015ec --- /dev/null +++ b/lib/task-spec/src/task-spec/training_layer_tensor_group_signature.cc @@ -0,0 +1,31 @@ +#include "task-spec/training_layer_tensor_group_signature.h" +#include + +namespace FlexFlow { + +std::vector get_training_tensor_groups_for_role( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role) { + + switch (tensor_role) { + case TensorRole::INPUT: + return signature.input_tensor_groups; + case TensorRole::WEIGHT: + return signature.weight_tensor_groups; + case TensorRole::OUTPUT: + return signature.output_tensor_groups; + default: + PANIC("Unhandled TensorRole {}", tensor_role); + } +} + 
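// Usage sketch (illustrative only; `sig` and `first_weight` are
// hypothetical names):
//
//   std::vector<TrainingTensorGroup> weights =
//       get_training_tensor_groups_for_role(sig, TensorRole::WEIGHT);
//   TrainingTensorGroup first_weight =
//       get_training_tensor_group_for_role_and_index(
//           sig, TensorRole::WEIGHT, 0_n);
//
// get_training_tensor_group_for_role_and_index (defined next) simply
// indexes into the same per-role vector.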
+TrainingTensorGroup get_training_tensor_group_for_role_and_index( + TrainingLayerTensorGroupSignature const &signature, + TensorRole tensor_role, + nonnegative_int index) { + + return get_training_tensor_groups_for_role(signature, tensor_role) + .at(index.unwrap_nonnegative()); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_tensor_group.cc b/lib/task-spec/src/task-spec/training_tensor_group.cc new file mode 100644 index 0000000000..0f6710b80f --- /dev/null +++ b/lib/task-spec/src/task-spec/training_tensor_group.cc @@ -0,0 +1,48 @@ +#include "task-spec/training_tensor_group.h" +#include "pcg/optimizer_attrs.h" +#include "utils/containers/repeat.h" +#include "utils/containers/set_union.h" +#include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" + +namespace FlexFlow { + +TrainingTensorGroup make_training_tensor_group_for_tensor_guid_t( + tensor_guid_t tensor_guid, + TensorAttrs const &tensor_attrs, + OptimizerAttrs const &optimizer_attrs, + ForwardTensorSource &forward_tensor_source, + GradientTensorSource &gradient_tensor_source, + OptimizerTensorSource &optimizer_tensor_source) { + + nonnegative_int num_optimizer_tensors = [&]() { + if (tensor_attrs.create_grad == CreateGrad::YES) { + return get_num_optimizer_tensors(optimizer_attrs); + } else { + return 0_n; + } + }(); + + return TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor_source.new_forward_tensor(), + /*gradient_tensor=*/gradient_tensor_source.new_gradient_tensor(), + /*optimizer_tensors=*/ + repeat(num_optimizer_tensors, + [&]() { return optimizer_tensor_source.new_optimizer_tensor(); }), + }; +} + +std::unordered_set + get_all_training_tensors_in_tensor_group(TrainingTensorGroup const &group) { + return set_union( + std::unordered_set{ + training_tensor_guid_t{group.forward_tensor}, + training_tensor_guid_t{group.gradient_tensor}, + }, + transform(unordered_set_of(group.optimizer_tensors), + [](optimizer_tensor_guid_t optimizer_tensor) { + return training_tensor_guid_t{optimizer_tensor}; + })); +} + +} // namespace FlexFlow diff --git a/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc b/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc new file mode 100644 index 0000000000..6014b46446 --- /dev/null +++ b/lib/task-spec/src/task-spec/training_tensor_group_with_attrs.cc @@ -0,0 +1,26 @@ +#include "task-spec/training_tensor_group_with_attrs.h" + +namespace FlexFlow { + +TrainingTensorGroupWithAttrs + make_training_tensor_group_with_attrs_from_group_and_attrs( + TrainingTensorGroup const &group, TensorAttrs const &attrs) { + + return TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/attrs, + /*forward_tensor=*/group.forward_tensor, + /*gradient_tensor=*/group.gradient_tensor, + /*optimizer_tensors=*/group.optimizer_tensors, + }; +} + +TrainingTensorGroup + tensor_group_without_attrs(TrainingTensorGroupWithAttrs const &with_attrs) { + return TrainingTensorGroup{ + /*forward_tensor=*/with_attrs.forward_tensor, + /*gradient_tensor=*/with_attrs.gradient_tensor, + /*optimizer_tensors=*/with_attrs.optimizer_tensors, + }; +} + +} // namespace FlexFlow diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group.cc b/lib/task-spec/test/src/task-spec/training_tensor_group.cc new file mode 100644 index 0000000000..b40c38ce69 --- /dev/null +++ b/lib/task-spec/test/src/task-spec/training_tensor_group.cc @@ -0,0 +1,36 @@ +#include "task-spec/training_tensor_group.h" +#include "test/utils/doctest/fmt/unordered_set.h" +#include + +using 
namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_all_training_tensors_in_tensor_group") { + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + optimizer_tensor_guid_t optimizer_tensor1 = optimizer_tensor_guid_t{8}; + optimizer_tensor_guid_t optimizer_tensor2 = optimizer_tensor_guid_t{3}; + + std::vector optimizer_tensors = { + optimizer_tensor1, + optimizer_tensor2, + }; + + TrainingTensorGroup training_tensor_group = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + std::unordered_set result = + get_all_training_tensors_in_tensor_group(training_tensor_group); + std::unordered_set correct = { + training_tensor_guid_t{forward_tensor}, + training_tensor_guid_t{gradient_tensor}, + training_tensor_guid_t{optimizer_tensor1}, + training_tensor_guid_t{optimizer_tensor2}, + }; + + CHECK(result == correct); + } +} diff --git a/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc b/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc new file mode 100644 index 0000000000..f769a877ad --- /dev/null +++ b/lib/task-spec/test/src/task-spec/training_tensor_group_with_attrs.cc @@ -0,0 +1,84 @@ +#include "task-spec/training_tensor_group_with_attrs.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("make_training_tensor_group_with_attrs_from_group_and_attrs") { + TensorAttrs tensor_attrs = TensorAttrs{ + /*shape=*/TensorShape{ + /*dims=*/TensorDims{FFOrdered{ + 8_p, + 2_p, + 3_p, + }}, + /*data_type=*/DataType::FLOAT, + }, + /*create_grad=*/CreateGrad::YES, + }; + + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + std::vector optimizer_tensors = { + optimizer_tensor_guid_t{8}, + optimizer_tensor_guid_t{3}, + }; + + TrainingTensorGroup training_tensor_group = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + TrainingTensorGroupWithAttrs result = + make_training_tensor_group_with_attrs_from_group_and_attrs( + training_tensor_group, tensor_attrs); + TrainingTensorGroupWithAttrs correct = TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/tensor_attrs, + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + CHECK(result == correct); + } + + TEST_CASE("tensor_group_without_attrs") { + TensorAttrs tensor_attrs = TensorAttrs{ + /*shape=*/TensorShape{ + /*dims=*/TensorDims{FFOrdered{ + 8_p, + 2_p, + 3_p, + }}, + /*data_type=*/DataType::FLOAT, + }, + /*create_grad=*/CreateGrad::YES, + }; + + forward_tensor_guid_t forward_tensor = forward_tensor_guid_t{3}; + gradient_tensor_guid_t gradient_tensor = gradient_tensor_guid_t{5}; + std::vector optimizer_tensors = { + optimizer_tensor_guid_t{8}, + optimizer_tensor_guid_t{3}, + }; + + TrainingTensorGroupWithAttrs tensor_group_with_attrs = + TrainingTensorGroupWithAttrs{ + /*tensor_attrs=*/tensor_attrs, + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + /*optimizer_tensors=*/optimizer_tensors, + }; + + TrainingTensorGroup result = + tensor_group_without_attrs(tensor_group_with_attrs); + TrainingTensorGroup correct = TrainingTensorGroup{ + /*forward_tensor=*/forward_tensor, + /*gradient_tensor=*/gradient_tensor, + 
/*optimizer_tensors=*/optimizer_tensors, + }; + + CHECK(result == correct); + } +} diff --git a/lib/utils/include/utils/archetypes/ordered_value_type.h b/lib/utils/include/utils/archetypes/ordered_value_type.h index 5218794fd1..b14f378667 100644 --- a/lib/utils/include/utils/archetypes/ordered_value_type.h +++ b/lib/utils/include/utils/archetypes/ordered_value_type.h @@ -39,6 +39,16 @@ struct ordered_value_type { } }; +template +std::string format_as(ordered_value_type const &) { + PANIC(); +} + +template +std::ostream &operator<<(std::ostream &s, ordered_value_type const &x) { + PANIC(); +} + } // namespace FlexFlow namespace std { diff --git a/lib/utils/include/utils/containers/all_are_true.h b/lib/utils/include/utils/containers/all_are_true.h new file mode 100644 index 0000000000..00a4d6016a --- /dev/null +++ b/lib/utils/include/utils/containers/all_are_true.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ALL_ARE_TRUE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ALL_ARE_TRUE_H + +namespace FlexFlow { + +template +bool all_are_true(Container const &c) { + bool result = true; + for (bool b : c) { + result &= b; + } + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/collapse_optionals.h b/lib/utils/include/utils/containers/collapse_optionals.h new file mode 100644 index 0000000000..9e39e25a57 --- /dev/null +++ b/lib/utils/include/utils/containers/collapse_optionals.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_COLLAPSE_OPTIONALS_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_COLLAPSE_OPTIONALS_H + +#include + +namespace FlexFlow { + +template +std::optional collapse_optionals(std::optional> const &o) { + if (!o.has_value()) { + return std::nullopt; + } + + return o.value(); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/contains_value.h b/lib/utils/include/utils/containers/contains_value.h new file mode 100644 index 0000000000..63d21a054a --- /dev/null +++ b/lib/utils/include/utils/containers/contains_value.h @@ -0,0 +1,33 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_CONTAINS_VALUE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_CONTAINS_VALUE_H + +#include +#include + +namespace FlexFlow { + +template +bool contains_value(std::unordered_map const &m, V const &v) { + for (auto const &[kk, vv] : m) { + if (vv == v) { + return true; + } + } + + return false; +} + +template +bool contains_value(std::map const &m, V const &v) { + for (auto const &[kk, vv] : m) { + if (vv == v) { + return true; + } + } + + return false; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/filter_keys.h b/lib/utils/include/utils/containers/filter_keys.h index f240fd2526..0758c48d49 100644 --- a/lib/utils/include/utils/containers/filter_keys.h +++ b/lib/utils/include/utils/containers/filter_keys.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_FILTER_KEYS_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_FILTER_KEYS_H +#include #include namespace FlexFlow { @@ -17,6 +18,17 @@ std::unordered_map filter_keys(std::unordered_map const &m, return result; } +template +std::map filter_keys(std::map const &m, F const &f) { + std::map result; + for (std::pair const &kv : m) { + if (f(kv.first)) { + result.insert(kv); + } + } + return result; +} + } // namespace FlexFlow #endif diff --git a/lib/utils/include/utils/containers/filtrans.h 
b/lib/utils/include/utils/containers/filtrans.h index be1b5093c9..9ee65dee74 100644 --- a/lib/utils/include/utils/containers/filtrans.h +++ b/lib/utils/include/utils/containers/filtrans.h @@ -23,7 +23,7 @@ using unwrap_optional_t = typename unwrap_optional::type; template >> -std::vector filtrans(std::vector const &v, F f) { +std::vector filtrans(std::vector const &v, F &&f) { std::vector result; for (In const &i : v) { @@ -39,7 +39,7 @@ std::vector filtrans(std::vector const &v, F f) { template >> -std::unordered_set filtrans(std::unordered_set const &s, F f) { +std::unordered_set filtrans(std::unordered_set const &s, F &&f) { std::unordered_set result; for (In const &i : s) { @@ -55,7 +55,7 @@ std::unordered_set filtrans(std::unordered_set const &s, F f) { template >> -std::set filtrans(std::set const &s, F f) { +std::set filtrans(std::set const &s, F &&f) { std::set result; for (In const &i : s) { diff --git a/lib/utils/include/utils/containers/flatmap.h b/lib/utils/include/utils/containers/flatmap.h index a7848b88aa..eaa8d1dbef 100644 --- a/lib/utils/include/utils/containers/flatmap.h +++ b/lib/utils/include/utils/containers/flatmap.h @@ -42,6 +42,17 @@ std::unordered_set flatmap_v2(std::unordered_set const &v, return result; } +template >> +std::set flatmap(std::set const &v, F const &f) { + std::set result; + for (auto const &elem : v) { + extend(result, f(elem)); + } + return result; +} + template < typename InK, typename InV, diff --git a/lib/utils/include/utils/exception.h b/lib/utils/include/utils/exception.h index f95eb8a38d..959edcff8a 100644 --- a/lib/utils/include/utils/exception.h +++ b/lib/utils/include/utils/exception.h @@ -31,7 +31,8 @@ T throw_if_unexpected(tl::expected const &r) { if (r.has_value()) { return r.value(); } else { - throw std::runtime_error(fmt::to_string(r.error())); + PANIC(fmt::to_string(r.error())); + ; } } diff --git a/lib/utils/include/utils/fmt/half.h b/lib/utils/include/utils/fmt/half.h new file mode 100644 index 0000000000..9cc1b5c1e7 --- /dev/null +++ b/lib/utils/include/utils/fmt/half.h @@ -0,0 +1,26 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_FMT_HALF_H + +#include "utils/half.h" +#include + +namespace fmt { + +template +struct formatter<::half, Char> : formatter { + template + auto format(::half const &h, FormatContext &ctx) -> decltype(ctx.out()) { + + return formatter::format(h, ctx); + } +}; + +} // namespace fmt + +namespace FlexFlow { + +std::ostream &operator<<(std::ostream &, ::half); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/fmt/set.h b/lib/utils/include/utils/fmt/set.h index c46984cc5a..d619d91500 100644 --- a/lib/utils/include/utils/fmt/set.h +++ b/lib/utils/include/utils/fmt/set.h @@ -4,6 +4,7 @@ #include "utils/check_fmtable.h" #include "utils/containers/sorted.h" #include "utils/join_strings.h" +#include "utils/type_traits_core.h" #include #include #include @@ -13,7 +14,7 @@ namespace fmt { template struct formatter<::std::set, Char, - std::enable_if_t>::value>> + std::enable_if_t>::value>> : formatter<::std::string> { template auto format(::std::set const &m, FormatContext &ctx) const diff --git a/lib/utils/include/utils/fp16.h b/lib/utils/include/utils/half.h similarity index 100% rename from lib/utils/include/utils/fp16.h rename to lib/utils/include/utils/half.h diff --git a/lib/utils/include/utils/json/half.h b/lib/utils/include/utils/json/half.h new file mode 100644 index 0000000000..a16d03a3e2 --- /dev/null +++ 
b/lib/utils/include/utils/json/half.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_JSON_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_JSON_HALF_H + +#include "utils/half.h" +#include + +namespace nlohmann { + +template <> +struct adl_serializer { + static void to_json(json &j, half x); + static void from_json(json const &j, half &t); +}; + +} // namespace nlohmann + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index a266ddea77..c775cfc9ed 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -14,6 +14,7 @@ class nonnegative_int { nonnegative_int() = delete; explicit nonnegative_int(int value); explicit nonnegative_int(size_t value); + explicit nonnegative_int(unsigned long long int value); explicit operator int() const noexcept; diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h index af323aef42..149671e243 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h @@ -2,10 +2,12 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H #include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/positive_int/positive_int.h" namespace FlexFlow { std::vector nonnegative_range(nonnegative_int end); +std::vector nonnegative_range(positive_int end); std::vector nonnegative_range(nonnegative_int start, nonnegative_int end, int step = 1); diff --git a/lib/utils/include/utils/rapidcheck/half.h b/lib/utils/include/utils/rapidcheck/half.h new file mode 100644 index 0000000000..ffa85ed41f --- /dev/null +++ b/lib/utils/include/utils/rapidcheck/half.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_HALF_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_HALF_H + +#include "utils/half.h" +#include + +namespace rc { + +template <> +struct Arbitrary<::half> { + static Gen<::half> arbitrary(); +}; + +} // namespace rc + +#endif diff --git a/lib/utils/include/utils/rapidcheck/monostate.h b/lib/utils/include/utils/rapidcheck/monostate.h new file mode 100644 index 0000000000..b34c069574 --- /dev/null +++ b/lib/utils/include/utils/rapidcheck/monostate.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_MONOSTATE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RAPIDCHECK_MONOSTATE_H + +#include +#include + +namespace rc { + +template <> +struct Arbitrary { + static Gen arbitrary(); +}; + +} // namespace rc + +#endif diff --git a/lib/utils/include/utils/units/milliseconds_t.h b/lib/utils/include/utils/units/milliseconds_t.h new file mode 100644 index 0000000000..ed3d5776a3 --- /dev/null +++ b/lib/utils/include/utils/units/milliseconds_t.h @@ -0,0 +1,67 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_MILLISECONDS_T_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_MILLISECONDS_T_H + +#include +#include +#include +#include + +namespace FlexFlow { + +struct milliseconds_t { +public: + milliseconds_t() = delete; + explicit milliseconds_t(float); + + bool operator<(milliseconds_t const &other) const; + bool operator==(milliseconds_t const &other) const; + bool operator>(milliseconds_t const &other) const; + bool operator<=(milliseconds_t const &other) const; + bool operator!=(milliseconds_t const &other) const; + bool operator>=(milliseconds_t const &other) const; + + milliseconds_t 
operator+(milliseconds_t const &other) const; + + float unwrap_milliseconds() const; + +private: + float value; +}; + +milliseconds_t operator""_ms(long double); +milliseconds_t operator""_ms(unsigned long long int); + +std::ostream &operator<<(std::ostream &, milliseconds_t const &); +std::string format_as(milliseconds_t const &); + +} // namespace FlexFlow + +namespace nlohmann { + +template <> +struct adl_serializer<::FlexFlow::milliseconds_t> { + static ::FlexFlow::milliseconds_t from_json(json const &j); + static void to_json(json &j, ::FlexFlow::milliseconds_t t); +}; + +} // namespace nlohmann + +namespace rc { + +template <> +struct Arbitrary<::FlexFlow::milliseconds_t> { + static Gen<::FlexFlow::milliseconds_t> arbitrary(); +}; + +} // namespace rc + +namespace std { + +template <> +struct hash<::FlexFlow::milliseconds_t> { + size_t operator()(::FlexFlow::milliseconds_t const &) const noexcept; +}; + +} // namespace std + +#endif diff --git a/lib/utils/include/utils/units/num_bytes_t.h b/lib/utils/include/utils/units/num_bytes_t.h new file mode 100644 index 0000000000..453cf4c84f --- /dev/null +++ b/lib/utils/include/utils/units/num_bytes_t.h @@ -0,0 +1,62 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_NUM_BYTES_T_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_UNITS_NUM_BYTES_T_H + +#include "utils/nonnegative_int/nonnegative_int.h" +namespace FlexFlow { + +struct num_bytes_t { +public: + num_bytes_t() = delete; + explicit num_bytes_t(nonnegative_int); + + bool operator<(num_bytes_t const &other) const; + bool operator==(num_bytes_t const &other) const; + bool operator>(num_bytes_t const &other) const; + bool operator<=(num_bytes_t const &other) const; + bool operator!=(num_bytes_t const &other) const; + bool operator>=(num_bytes_t const &other) const; + + num_bytes_t operator+(num_bytes_t const &other) const; + + nonnegative_int unwrap_num_bytes() const; + +private: + nonnegative_int value; +}; + +num_bytes_t operator""_bytes(unsigned long long int); + +std::ostream &operator<<(std::ostream &, num_bytes_t const &); +std::string format_as(num_bytes_t const &); + +} // namespace FlexFlow + +namespace nlohmann { + +template <> +struct adl_serializer<::FlexFlow::num_bytes_t> { + static ::FlexFlow::num_bytes_t from_json(json const &j); + static void to_json(json &j, ::FlexFlow::num_bytes_t t); +}; + +} // namespace nlohmann + +namespace rc { + +template <> +struct Arbitrary<::FlexFlow::num_bytes_t> { + static Gen<::FlexFlow::num_bytes_t> arbitrary(); +}; + +} // namespace rc + +namespace std { + +template <> +struct hash<::FlexFlow::num_bytes_t> { + size_t operator()(::FlexFlow::num_bytes_t const &) const noexcept; +}; + +} // namespace std + +#endif diff --git a/lib/utils/src/fp16.cc b/lib/utils/src/half.cc similarity index 87% rename from lib/utils/src/fp16.cc rename to lib/utils/src/half.cc index f9dbf486ab..3dbea5c4dc 100644 --- a/lib/utils/src/fp16.cc +++ b/lib/utils/src/half.cc @@ -1,4 +1,4 @@ -#include "utils/fp16.h" +#include "utils/half.h" #include "utils/hash-utils.h" namespace std { diff --git a/lib/utils/src/utils/containers/all_are_true.cc b/lib/utils/src/utils/containers/all_are_true.cc new file mode 100644 index 0000000000..5647069f0e --- /dev/null +++ b/lib/utils/src/utils/containers/all_are_true.cc @@ -0,0 +1,10 @@ +#include "utils/containers/all_are_true.h" +#include + +namespace FlexFlow { + +using Container = std::vector; + +template bool all_are_true(Container const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/collapse_optionals.cc 
b/lib/utils/src/utils/containers/collapse_optionals.cc
new file mode 100644
index 0000000000..b55b16a908
--- /dev/null
+++ b/lib/utils/src/utils/containers/collapse_optionals.cc
@@ -0,0 +1,11 @@
+#include "utils/containers/collapse_optionals.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using T = value_type<0>;
+
+template std::optional<T>
+    collapse_optionals(std::optional<std::optional<T>> const &);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/containers/contains_value.cc b/lib/utils/src/utils/containers/contains_value.cc
new file mode 100644
index 0000000000..d9d2118658
--- /dev/null
+++ b/lib/utils/src/utils/containers/contains_value.cc
@@ -0,0 +1,13 @@
+#include "utils/containers/contains_value.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using K = value_type<0>;
+using V = value_type<1>;
+
+template bool contains_value(std::unordered_map<K, V> const &, V const &);
+
+template bool contains_value(std::map<K, V> const &, V const &);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/containers/filtrans.cc b/lib/utils/src/utils/containers/filtrans.cc
index a57a743ef0..c65a22a669 100644
--- a/lib/utils/src/utils/containers/filtrans.cc
+++ b/lib/utils/src/utils/containers/filtrans.cc
@@ -1 +1,12 @@
 #include "utils/containers/filtrans.h"
+#include "utils/archetypes/value_type.h"
+
+namespace FlexFlow {
+
+using In = value_type<0>;
+using Out = value_type<1>;
+using F = std::function<std::optional<Out>(In const &)>;
+
+template std::vector<Out> filtrans(std::vector<In> const &, F &&);
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/fmt/half.cc b/lib/utils/src/utils/fmt/half.cc
new file mode 100644
index 0000000000..0075e6e7a7
--- /dev/null
+++ b/lib/utils/src/utils/fmt/half.cc
@@ -0,0 +1,9 @@
+#include "utils/fmt/half.h"
+
+namespace FlexFlow {
+
+std::ostream &operator<<(std::ostream &s, ::half h) {
+  return (s << static_cast<float>(h));
+}
+
+} // namespace FlexFlow
diff --git a/lib/utils/src/utils/fmt/set.cc b/lib/utils/src/utils/fmt/set.cc
index 857367af48..db439414c9 100644
--- a/lib/utils/src/utils/fmt/set.cc
+++ b/lib/utils/src/utils/fmt/set.cc
@@ -1 +1,16 @@
 #include "utils/fmt/set.h"
+#include "utils/archetypes/ordered_value_type.h"
+
+using T = ::FlexFlow::ordered_value_type<0>;
+
+namespace fmt {
+
+template struct formatter<::std::set<T>, char>;
+
+}
+
+namespace FlexFlow {
+
+template std::ostream &operator<<(std::ostream &, std::set<T> const &);
+
+}
diff --git a/lib/utils/src/utils/json/half.cc b/lib/utils/src/utils/json/half.cc
new file mode 100644
index 0000000000..6555de13c5
--- /dev/null
+++ b/lib/utils/src/utils/json/half.cc
@@ -0,0 +1,13 @@
+#include "utils/json/half.h"
+
+namespace nlohmann {
+
+void adl_serializer<half>::to_json(json &j, half x) {
+  j = static_cast<float>(x);
+}
+
+void adl_serializer<half>::from_json(json const &j, half &x) {
+  x = j.get<float>();
+}
+
+} // namespace nlohmann
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
index 3472a7eee2..7593a8e9ec 100644
--- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc
@@ -4,20 +4,20 @@ namespace FlexFlow {
 
 nonnegative_int::nonnegative_int(int value) {
-  if (value < 0) {
-    throw std::invalid_argument(
-        "Value of nonnegative_int type must be nonnegative.");
-  }
+  ASSERT(value >= 0, "Value of nonnegative_int must be nonnegative");
   this->value_ = value;
 }
 
 nonnegative_int::nonnegative_int(size_t value) {
-  if (value > std::numeric_limits<int>::max()) {
-    throw std::invalid_argument(fmt::format(
-        "Input {} to nonnegative_int(size_t) is out-of-bounds for int",
-        value));
-  }
+  ASSERT(value <= std::numeric_limits<int>::max());
+  this->value_ = static_cast<int>(value);
+  ASSERT(this->value_ >= 0, "Value of nonnegative_int must be nonnegative");
+}
+
+nonnegative_int::nonnegative_int(unsigned long long int value) {
+  ASSERT(value <= std::numeric_limits<int>::max());
   this->value_ = static_cast<int>(value);
-  assert(this->value_ >= 0);
+  ASSERT(this->value_ >= 0, "Value of nonnegative_int must be nonnegative");
 }
 
 nonnegative_int::operator int() const noexcept {
@@ -27,18 +27,23 @@ bool nonnegative_int::operator<(nonnegative_int const &other) const {
   return this->value_ < other.value_;
 }
+
 bool nonnegative_int::operator==(nonnegative_int const &other) const {
   return this->value_ == other.value_;
 }
+
 bool nonnegative_int::operator>(nonnegative_int const &other) const {
   return this->value_ > other.value_;
 }
+
 bool nonnegative_int::operator<=(nonnegative_int const &other) const {
   return this->value_ <= other.value_;
 }
+
 bool nonnegative_int::operator!=(nonnegative_int const &other) const {
   return this->value_ != other.value_;
 }
+
 bool nonnegative_int::operator>=(nonnegative_int const &other) const {
   return this->value_ >= other.value_;
 }
@@ -46,18 +51,23 @@ bool nonnegative_int::operator>=(nonnegative_int const &other) const {
 bool nonnegative_int::operator<(int const &other) const {
   return this->value_ < other;
 }
+
 bool nonnegative_int::operator==(int const &other) const {
   return this->value_ == other;
 }
+
 bool nonnegative_int::operator>(int const &other) const {
   return this->value_ > other;
 }
+
 bool nonnegative_int::operator<=(int const &other) const {
   return this->value_ <= other;
 }
+
 bool nonnegative_int::operator!=(int const &other) const {
   return this->value_ != other;
 }
+
 bool nonnegative_int::operator>=(int const &other) const {
   return this->value_ >= other;
 }
@@ -65,18 +75,23 @@ bool nonnegative_int::operator>=(int const &other) const {
 bool operator<(int const &lhs, nonnegative_int const &rhs) {
   return lhs < rhs.value_;
 }
+
 bool operator==(int const &lhs, nonnegative_int const &rhs) {
   return lhs == rhs.value_;
 }
+
 bool operator>(int const &lhs, nonnegative_int const &rhs) {
   return lhs > rhs.value_;
 }
+
 bool operator<=(int const &lhs, nonnegative_int const &rhs) {
   return lhs <= rhs.value_;
 }
+
 bool operator!=(int const &lhs, nonnegative_int const &rhs) {
   return lhs != rhs.value_;
 }
+
 bool operator>=(int const &lhs, nonnegative_int const &rhs) {
   return lhs >= rhs.value_;
 }
diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
index f31db6d589..8195759388 100644
--- a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
+++ b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc
@@ -9,6 +9,10 @@ std::vector<nonnegative_int> nonnegative_range(nonnegative_int end) {
       [](int x) { return nonnegative_int{x}; });
 }
 
+std::vector<nonnegative_int> nonnegative_range(positive_int end) {
+  return nonnegative_range(end.nonnegative_int_from_positive_int());
+}
+
 std::vector<nonnegative_int>
     nonnegative_range(nonnegative_int start, nonnegative_int end, int step) {
   return transform(
diff --git a/lib/utils/src/utils/rapidcheck/half.cc b/lib/utils/src/utils/rapidcheck/half.cc
new file mode 100644
index 0000000000..80d009364a
--- /dev/null
+++ b/lib/utils/src/utils/rapidcheck/half.cc
@@ -0,0 +1,9 @@
+#include "utils/rapidcheck/half.h"
+
+namespace rc {
+
+Gen<::half> Arbitrary<::half>::arbitrary() {
+  return gen::construct<::half>(gen::arbitrary<float>());
+}
+
+} // namespace rc
diff --git a/lib/utils/src/utils/rapidcheck/monostate.cc b/lib/utils/src/utils/rapidcheck/monostate.cc
new file mode 100644
index 0000000000..96c72373aa
--- /dev/null
+++ b/lib/utils/src/utils/rapidcheck/monostate.cc
@@ -0,0 +1,9 @@
+#include "utils/rapidcheck/monostate.h"
+
+namespace rc {
+
+Gen<std::monostate> Arbitrary<std::monostate>::arbitrary() {
+  return gen::construct<std::monostate>();
+}
+
+} // namespace rc
diff --git a/lib/utils/src/utils/units/milliseconds_t.cc b/lib/utils/src/utils/units/milliseconds_t.cc
new file mode 100644
index 0000000000..fb0dd01d64
--- /dev/null
+++ b/lib/utils/src/utils/units/milliseconds_t.cc
@@ -0,0 +1,94 @@
+#include "utils/units/milliseconds_t.h"
+#include "utils/hash-utils.h"
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+milliseconds_t::milliseconds_t(float value) : value(value) {}
+
+bool milliseconds_t::operator<(milliseconds_t const &other) const {
+  return this->value < other.value;
+}
+
+bool milliseconds_t::operator==(milliseconds_t const &other) const {
+  return this->value == other.value;
+}
+
+bool milliseconds_t::operator>(milliseconds_t const &other) const {
+  return this->value > other.value;
+}
+
+bool milliseconds_t::operator<=(milliseconds_t const &other) const {
+  return this->value <= other.value;
+}
+
+bool milliseconds_t::operator!=(milliseconds_t const &other) const {
+  return this->value != other.value;
+}
+
+bool milliseconds_t::operator>=(milliseconds_t const &other) const {
+  return this->value >= other.value;
+}
+
+milliseconds_t milliseconds_t::operator+(milliseconds_t const &other) const {
+  return milliseconds_t{
+      this->value + other.value,
+  };
+}
+
+float milliseconds_t::unwrap_milliseconds() const {
+  return this->value;
+}
+
+milliseconds_t operator""_ms(long double x) {
+  ASSERT(x <= std::numeric_limits<float>::max());
+  ASSERT(x >= std::numeric_limits<float>::lowest());
+  return milliseconds_t{static_cast<float>(x)};
+}
+
+milliseconds_t operator""_ms(unsigned long long int x) {
+  ASSERT(x <= std::numeric_limits<float>::max());
+  return milliseconds_t{static_cast<float>(x)};
+}
+
+std::ostream &operator<<(std::ostream &s, milliseconds_t const &m) {
+  return (s << fmt::to_string(m));
+}
+
+std::string format_as(milliseconds_t const &m) {
+  return fmt::format("{}_ms", m.unwrap_milliseconds());
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+::FlexFlow::milliseconds_t
+    adl_serializer<::FlexFlow::milliseconds_t>::from_json(json const &j) {
+  return ::FlexFlow::milliseconds_t{j.template get<float>()};
+}
+
+void adl_serializer<::FlexFlow::milliseconds_t>::to_json(
+    json &j, ::FlexFlow::milliseconds_t t) {
+  j = t.unwrap_milliseconds();
+}
+} // namespace nlohmann
+
+namespace rc {
+
+Gen<::FlexFlow::milliseconds_t>
+    Arbitrary<::FlexFlow::milliseconds_t>::arbitrary() {
+  return gen::construct<::FlexFlow::milliseconds_t>(gen::arbitrary<float>());
+}
+
+} // namespace rc
+
+namespace std {
+
+size_t hash<::FlexFlow::milliseconds_t>::operator()(
+    ::FlexFlow::milliseconds_t const &m) const noexcept {
+  return ::FlexFlow::get_std_hash(m.unwrap_milliseconds());
+}
+
+} // namespace std
diff --git a/lib/utils/src/utils/units/num_bytes_t.cc b/lib/utils/src/utils/units/num_bytes_t.cc
new file mode 100644
index 0000000000..f845d0a91b
--- /dev/null
+++ b/lib/utils/src/utils/units/num_bytes_t.cc
@@ -0,0 +1,87 @@
+#include "utils/units/num_bytes_t.h"
+#include "utils/hash-utils.h"
+#include
+#include
+#include
+
+namespace FlexFlow {
+
+num_bytes_t::num_bytes_t(nonnegative_int value) : value(value) {}
+
+bool num_bytes_t::operator<(num_bytes_t const &other) const {
+  return this->value < other.value;
+}
+
+bool num_bytes_t::operator==(num_bytes_t const &other) const {
+  return this->value == other.value;
+}
+
+bool num_bytes_t::operator>(num_bytes_t const &other) const {
+  return this->value > other.value;
+}
+
+bool num_bytes_t::operator<=(num_bytes_t const &other) const {
+  return this->value <= other.value;
+}
+
+bool num_bytes_t::operator!=(num_bytes_t const &other) const {
+  return this->value != other.value;
+}
+
+bool num_bytes_t::operator>=(num_bytes_t const &other) const {
+  return this->value >= other.value;
+}
+
+num_bytes_t num_bytes_t::operator+(num_bytes_t const &other) const {
+  return num_bytes_t{
+      this->value + other.value,
+  };
+}
+
+nonnegative_int num_bytes_t::unwrap_num_bytes() const {
+  return this->value;
+}
+
+num_bytes_t operator""_bytes(unsigned long long int x) {
+  return num_bytes_t{nonnegative_int{x}};
+}
+
+std::ostream &operator<<(std::ostream &s, num_bytes_t const &m) {
+  return (s << fmt::to_string(m));
+}
+
+std::string format_as(num_bytes_t const &m) {
+  return fmt::format("{}_bytes", m.unwrap_num_bytes());
+}
+
+} // namespace FlexFlow
+
+namespace nlohmann {
+::FlexFlow::num_bytes_t
+    adl_serializer<::FlexFlow::num_bytes_t>::from_json(json const &j) {
+  return ::FlexFlow::num_bytes_t{j.template get<::FlexFlow::nonnegative_int>()};
+}
+
+void adl_serializer<::FlexFlow::num_bytes_t>::to_json(
+    json &j, ::FlexFlow::num_bytes_t t) {
+  j = t.unwrap_num_bytes();
+}
+} // namespace nlohmann
+
+namespace rc {
+
+Gen<::FlexFlow::num_bytes_t> Arbitrary<::FlexFlow::num_bytes_t>::arbitrary() {
+  return gen::construct<::FlexFlow::num_bytes_t>(
+      gen::arbitrary<::FlexFlow::nonnegative_int>());
+}
+
+} // namespace rc
+
+namespace std {
+
+size_t hash<::FlexFlow::num_bytes_t>::operator()(
+    ::FlexFlow::num_bytes_t const &m) const noexcept {
+  return ::FlexFlow::get_std_hash(m.unwrap_num_bytes());
+}
+
+} // namespace std
diff --git a/lib/utils/test/common/include/test/utils/doctest/fmt/half.h b/lib/utils/test/common/include/test/utils/doctest/fmt/half.h
new file mode 100644
index 0000000000..f3694bb981
--- /dev/null
+++ b/lib/utils/test/common/include/test/utils/doctest/fmt/half.h
@@ -0,0 +1,16 @@
+#ifndef _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_FMT_HALF_H
+#define _FLEXFLOW_LIB_UTILS_TEST_COMMON_INCLUDE_TEST_UTILS_DOCTEST_FMT_HALF_H
+
+#include "utils/half.h"
+#include
+
+namespace doctest {
+
+template <>
+struct StringMaker<::half> {
+  static String convert(::half const &);
+};
+
+} // namespace doctest
+
+#endif
diff --git a/lib/utils/test/common/src/main.cc b/lib/utils/test/common/src/main.cc
index 6df2d925b7..9c72e6310c 100644
--- a/lib/utils/test/common/src/main.cc
+++ b/lib/utils/test/common/src/main.cc
@@ -1,6 +1,7 @@
 #define DOCTEST_CONFIG_IMPLEMENT
 #include
+#include
 #include
 #include
 
@@ -10,6 +11,7 @@ void libassert_throw_exception_handler(libassert::assertion_info const &info) {
 
 int main(int argc, char **argv) {
   libassert::set_failure_handler(libassert_throw_exception_handler);
+  cpptrace::register_terminate_handler();
   return doctest::Context(argc, argv).run();
 }
diff --git a/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc b/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc
new file mode 100644
index 0000000000..c2e8124678
--- /dev/null
+++ b/lib/utils/test/common/src/test/utils/doctest/fmt/half.cc
@@ -0,0 +1,9 @@
+#include "test/utils/doctest/fmt/half.h"
+
+namespace doctest {
+
+String StringMaker<::half>::convert(::half const &h) {
+  return toString(static_cast<float>(h));
+}
+
+} // namespace doctest
diff --git a/lib/utils/test/src/utils/containers/all_are_true.cc b/lib/utils/test/src/utils/containers/all_are_true.cc
new file mode 100644
index 0000000000..3a725a7d00
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/all_are_true.cc
@@ -0,0 +1,36 @@
+#include "utils/containers/all_are_true.h"
+#include
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("all_are_true") {
+    SUBCASE("all elements are true") {
+      std::vector<bool> input = {true, true, true};
+
+      bool result = all_are_true(input);
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("not all elements are true") {
+      std::vector<bool> input = {true, false, true, false};
+
+      bool result = all_are_true(input);
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("empty input vector") {
+      std::vector<bool> input = {};
+
+      bool result = all_are_true(input);
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/containers/collapse_optionals.cc b/lib/utils/test/src/utils/containers/collapse_optionals.cc
new file mode 100644
index 0000000000..201b1bdf02
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/collapse_optionals.cc
@@ -0,0 +1,38 @@
+#include "utils/containers/collapse_optionals.h"
+#include "test/utils/doctest/fmt/optional.h"
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("collapse_optionals(std::optional<std::optional<T>>)") {
+    SUBCASE("returns the value if the input has value") {
+      std::optional<std::optional<int>> input = 8;
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = 8;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns nullopt if the input is wrapped nullopt") {
+      std::optional<std::optional<int>> input =
+          std::optional<int>{std::nullopt};
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = std::nullopt;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns nullopt if the input is unwrapped nullopt") {
+      std::optional<std::optional<int>> input =
+          std::optional<std::optional<int>>{std::nullopt};
+
+      std::optional<int> result = collapse_optionals(input);
+      std::optional<int> correct = std::nullopt;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/containers/contains_value.cc b/lib/utils/test/src/utils/containers/contains_value.cc
new file mode 100644
index 0000000000..136ef3b304
--- /dev/null
+++ b/lib/utils/test/src/utils/containers/contains_value.cc
@@ -0,0 +1,51 @@
+#include "utils/containers/contains_value.h"
+#include
+#include
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("contains_value(std::unordered_map<K, V>, V)") {
+    std::unordered_map<int, std::string> m = {
+        {1, "one"},
+        {3, "three"},
+        {4, "three"},
+    };
+
+    SUBCASE("returns true if the value is in the map") {
+      bool result = contains_value(m, std::string{"one"});
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns false if the value is not in the map") {
+      bool result = contains_value(m, std::string{"two"});
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("contains_value(std::map<K, V>, V)") {
+    std::map<int, std::string> m = {
+        {1, "one"},
+        {3, "three"},
+        {4, "three"},
+    };
+
+    SUBCASE("returns true if the value is in the map") {
+      bool result = contains_value(m, std::string{"one"});
+      bool correct = true;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("returns false if the value is not in the map") {
+      bool result = contains_value(m, std::string{"two"});
+      bool correct = false;
+
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/positive_int/positive_int.cc b/lib/utils/test/src/utils/positive_int/positive_int.cc
index 77ecbf854d..88454bbfbd 100644
--- a/lib/utils/test/src/utils/positive_int/positive_int.cc
+++ b/lib/utils/test/src/utils/positive_int/positive_int.cc
@@ -1,6 +1,6 @@
 #include "utils/positive_int/positive_int.h"
-#include
 #include "test/utils/rapidcheck.h"
+#include
 
 using namespace ::FlexFlow;
 
@@ -51,16 +51,16 @@
   }
 
   TEST_CASE("_p notation for positive_int") {
-    CHECK(9_p == positive_int{9});
+    CHECK(9_p == positive_int{9});
     CHECK_THROWS(0_p);
   }
 
   TEST_CASE("static_cast<int>(positive_int)") {
-    CHECK(static_cast<int>(8_p) == 8);
+    CHECK(static_cast<int>(8_p) == 8);
   }
 
   TEST_CASE("static_cast<nonnegative_int>(positive_int)") {
-    CHECK(static_cast<nonnegative_int>(6_p) == 6);
+    CHECK(static_cast<nonnegative_int>(6_p) == 6);
   }
 
   TEST_CASE("positive_int < positive_int") {
@@ -321,7 +321,7 @@
   }
 
   TEST_CASE("positive_int * positive_int") {
-    CHECK(3_p * 4_p == 12_p);
+    CHECK(3_p * 4_p == 12_p);
   }
 
   TEST_CASE("positive_int *= positive_int") {
@@ -339,7 +339,7 @@
   }
 
   TEST_CASE("positive_int * nonnegative_int") {
-    CHECK(3_p * 4_n == 12_n);
+    CHECK(3_p * 4_n == 12_n);
     CHECK(3_p * 0_n == 0_n);
   }
 
@@ -360,10 +360,10 @@
   }
 
   TEST_CASE("float / positive_int") {
-    CHECK(4.0f / 2_p == 2.0f);
-    CHECK(3.0f / 2_p == 1.5f);
-    CHECK(-3.0f / 4_p == -0.75f);
-    CHECK(0.0f / 1_p == 0.0f);
+    CHECK(4.0f / 2_p == 2.0f);
+    CHECK(3.0f / 2_p == 1.5f);
+    CHECK(-3.0f / 4_p == -0.75f);
+    CHECK(0.0f / 1_p == 0.0f);
   }
 
   TEST_CASE("float /= positive_int") {
@@ -411,11 +411,11 @@
   }
 
   TEST_CASE("positive_int::int_from_positive_int()") {
-    CHECK((3_p).int_from_positive_int() == 3);
+    CHECK((3_p).int_from_positive_int() == 3);
   }
 
   TEST_CASE("positive_int::nonnegative_int_from_positive_int()") {
-    CHECK((4_p).nonnegative_int_from_positive_int() == 4);
+    CHECK((4_p).nonnegative_int_from_positive_int() == 4);
   }
 
   TEST_CASE("positive_int::operator<<(std::ostream &, positive_int)") {
@@ -443,7 +443,7 @@
     nlohmann::json correct = 5;
 
     CHECK(result == correct);
-  }
+  }
 
   SUBCASE("from_json") {
     nlohmann::json input = 5;
@@ -480,6 +480,6 @@
   }
 
   TEST_CASE("rc::Arbitrary<positive_int>") {
-    RC_SUBCASE([](positive_int) { });
+    RC_SUBCASE([](positive_int) {});
  }
 }

From 426206e90d63f3ab2d6411a1c1a300883b9cd472 Mon Sep 17 00:00:00 2001
From: fruitea
Date: Tue, 5 Aug 2025 20:42:59 -0700
Subject: [PATCH 90/91] fix: e2e test for realm backend

---
 .../src/realm_training_backing.cc         |   4 +-
 lib/realm-backend/test/src/test_e2e.cc    |   2 +-
 lib/realm-backend/test/src/test_update.cc | 124 ------------------
 3 files changed, 4 insertions(+), 126 deletions(-)
 delete mode 100644 lib/realm-backend/test/src/test_update.cc

diff --git a/lib/realm-backend/src/realm_training_backing.cc b/lib/realm-backend/src/realm_training_backing.cc
index b436443cdb..66bc098e07 100644
--- a/lib/realm-backend/src/realm_training_backing.cc
+++ b/lib/realm-backend/src/realm_training_backing.cc
@@ -22,7 +22,7 @@ namespace FlexFlow {
 
 using namespace Realm;
 
-LocalTrainingBacking make_local_training_backing_for_computation_graph(
+LocalTrainingBacking make_realm_training_backing_for_computation_graph(
     RealmRuntimeState &runtime_state,
     std::unordered_map const &preallocated,
@@ -267,6 +267,8 @@ Future execute_update(LocalTrainingBacking const &local_training_backing,
     runtime_state.worker_events[0] = e;
     future.set_event(e);
     return future;
+  } else {
+    return Future();
   }
 }
 
diff --git a/lib/realm-backend/test/src/test_e2e.cc b/lib/realm-backend/test/src/test_e2e.cc
index 66ff034240..f7a338d32b 100644
--- a/lib/realm-backend/test/src/test_e2e.cc
+++ b/lib/realm-backend/test/src/test_e2e.cc
@@ -194,6 +194,6 @@ void top_level_task(const void *args, size_t arglen, const void *userdata,
   GenericTensorAccessorR first_epoch_loss = loss_values.at(0);
   GenericTensorAccessorR last_epoch = loss_values.back();
   assert(did_loss_decrease(first_epoch_loss, last_epoch));
-  printf("passed\n");
+  printf("passed!\n");
 }
 }
diff --git a/lib/realm-backend/test/src/test_update.cc b/lib/realm-backend/test/src/test_update.cc
deleted file mode 100644
index cd7119271d..0000000000
--- a/lib/realm-backend/test/src/test_update.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-#include "kernels/managed_ff_stream.h"
-#include "kernels/managed_per_device_ff_handle.h"
-#include "local-execution/allocated_tensors.h"
-#include "pcg/computation_graph.h"
-#include "pcg/computation_graph_builder.h"
-#include "pcg/optimizer_attrs.dtg.h"
-#include "realm-backend/driver.h"
-#include "realm-backend/realm_allocator.h"
-#include "realm-backend/local_training_backing.h"
-#include "test_utils.h"
-
-using namespace ::FlexFlow;
-using namespace Realm;
-
-void top_level_task(const void *args, size_t arglen, const void *userdata,
-                    size_t userlen, Realm::Processor p) {
-  // initialize runtime configs
-  ManagedFFStream managed_stream{};
-  ManagedPerDeviceFFHandle managed_handle = initialize_single_gpu_handle();
-  std::vector<Processor> worker_procs;
-  std::vector<Allocator> allocators;
-  Machine::ProcessorQuery pq = Machine::ProcessorQuery(Machine::get_machine())
-                                   .only_kind(Processor::TOC_PROC);
-  assert(pq.count() > 0);
-  for (Processor p : pq) {
-    worker_procs.push_back(p);
-    allocators.push_back(create_realm_memory_allocator(p));
-  }
-
-  AllocatedTensors allocated_tensors = make_empty_allocated_tensors();
-
-  // construct computation graph
-  ComputationGraph computation_graph = make_empty_computation_graph();
-
-  nonnegative_int batch_size = 10_n;
-  nonnegative_int data_dim = 16_n;
-  nonnegative_int output_dim = 32_n;
-
-  TensorShape input_tensor_shape =
-      TensorShape{TensorDims{FFOrdered<nonnegative_int>{batch_size, data_dim}},
-                  DataType::FLOAT};
-
-  TensorShape weight_shape =
-      TensorShape{TensorDims{FFOrdered<nonnegative_int>{data_dim, output_dim}},
-                  DataType::FLOAT};
-
-  LayerAddedResult inputs_layer =
-      add_input_layer(computation_graph, input_tensor_shape);
-
-  LayerAddedResult weights_layer = add_layer(
-      computation_graph,
-      LayerAttrs{ComputationGraphOpAttrs{WeightAttrs{
-                     weight_shape, InitializerAttrs{ZeroInitializerAttrs{}}}},
-                 "weights"},
-      {}, {});
-
-  LayerAddedResult linear_operator =
-      add_layer(computation_graph,
-                LayerAttrs{ComputationGraphOpAttrs{
-                               LinearAttrs{output_dim,
-                                           /*use_bias=*/false, DataType::FLOAT,
-                                           Activation::RELU, std::nullopt}},
-                           "linear"},
-                inputs_layer.outputs, weights_layer.outputs);
-
-  RuntimeArgConfig runtime_arg_config = RuntimeArgConfig{
-      DeviceSpecific<PerDeviceFFHandle>::create(managed_handle.raw_handle()),
-      EnableProfiling::YES,
-      ProfilingSettings{/*warmup_iters=*/0, /*measure_iters=*/1}};
-
-  GradientTensorSource gradient_tensor_source;
-  OptimizerTensorSource optimizer_tensor_source;
-
-  int test_id = 0;
-
-  {
-    printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
-                                         /*momentum=*/0.0f,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.001}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-
-  {
-    printf("\nRunning test %d: SGDOptimizerAttrs, momentum=0.9...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{SGDOptimizerAttrs{/*lr=*/0.001,
-                                         /*momentum=*/0.9,
-                                         /*nesterov=*/false,
-                                         /*weight_decay=*/0.001}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-
-  {
-    printf("\nRunning test %d: AdamOptimizerAttrs...\n", ++test_id);
-    OptimizerAttrs optimizer_attrs =
-        OptimizerAttrs{AdamOptimizerAttrs{/*alpha=*/0.001,
-                                          /*beta1=*/0.9,
-                                          /*beta2=*/0.999,
-                                          /*weight_decay=*/0.001,
-                                          /*alpha_t=*/0.001,
-                                          /*beta_t=*/0.9,
-                                          /*beta2_t=*/0.999,
-                                          /*epsilon=*/1e-8}};
-    LocalTrainingBacking local_training_backing = LocalTrainingBacking(
-        p, worker_procs, allocators, allocated_tensors, gradient_tensor_source,
-        optimizer_tensor_source, computation_graph, runtime_arg_config,
-        optimizer_attrs);
-    execute_update(local_training_backing, linear_operator.layer, optimizer_attrs).wait();
-    printf("passed\n");
-  }
-}

From b21edc6c785bd770787259f11a65a2a7662f51a8 Mon Sep 17 00:00:00 2001
From: Zhuofu Chen <59316330+chenzhuofu@users.noreply.github.com>
Date: Fri, 5 Dec 2025 14:32:26 -0500
Subject: [PATCH 91/91] Delete lib/realm-backend/src/realm_training_backing
 copy.cc

---
 .../src/realm_training_backing copy.cc | 126 ------------------
 1 file changed, 126 deletions(-)
 delete mode 100644 lib/realm-backend/src/realm_training_backing copy.cc

diff --git a/lib/realm-backend/src/realm_training_backing copy.cc b/lib/realm-backend/src/realm_training_backing copy.cc
deleted file mode 100644
index e6a3079a25..0000000000
--- a/lib/realm-backend/src/realm_training_backing copy.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-// #include "kernels/allocation.h"
-// #include "local-execution/loss_functions.h"
-// #include "local-execution/optimizer.h"
-// #include "pcg/computation_graph.dtg.h"
-// #include "pcg/computation_graph.h"
-// #include "pcg/optimizer_attrs.h"
-// #include "realm-backend/realm_tensor_backing.h"
-// #include "task-spec/op_task_to_task_invocation.h"
-// #include "task-spec/runtime_arg_config.h"
-// #include "task-spec/task_invocation.h"
-// #include "task-spec/task_signature_impl.h"
-// #include "utils/containers/contains.h"
-// #include "utils/containers/contains_key.h"
-// #include "utils/containers/get_only.h"
-// #include "utils/containers/values.h"
-// #include "utils/exception.h"
-
-// #include "realm-backend/realm_training_backing.h"
-// #include "realm-backend/task_result.h"
-// #include "realm-backend/task_wrapper.h"
-
-// namespace FlexFlow {
-
-// using namespace Realm;
-
-// RealmTrainingBacking::RealmTrainingBacking(
-//     Processor master_proc, std::vector<Processor> const &worker_procs,
-//     std::vector<Allocator> const &allocators,
-//     AllocatedTensors const &allocated_tensors,
-//     GradientTensorSource &gradient_tensor_source,
-//     ComputationGraph const &computation_graph,
-//     RuntimeArgConfig const &runtime_arg_config)
-//     : master_proc(master_proc), master_event(Realm::Event::NO_EVENT),
-//       master_mem(Machine::MemoryQuery(Machine::get_machine())
-//                      .only_kind(Memory::SYSTEM_MEM)
-//                      .best_affinity_to(master_proc)
-//                      .first()),
-//       worker_procs(worker_procs),
-//       worker_events(std::vector<Realm::Event>(worker_procs.size(),
-//                                               Realm::Event::NO_EVENT)),
-//       allocators(allocators), computation_graph(computation_graph),
-//       task_registry(construct_task_registry_and_register_tasks_for_realm(
-//           computation_graph, worker_procs)),
-//       realm_tensor_backing(construct_realm_tensor_backing( // TODO: multi gpu
-//           allocated_tensors,
-//           generate_unallocated_tensors(
-//               allocated_tensors, get_all_tensor_attrs(computation_graph),
-//               gradient_tensor_source),
-//           this->allocators[0])),
-//       realm_args_backing(initialize_args_backing(this, computation_graph, runtime_arg_config)) {}
-
-// TaskRegistry construct_task_registry_and_register_tasks_for_realm(
-//     ComputationGraph const &cg, std::vector<Processor> const &worker_procs) {
-//   TaskRegistry task_registry = construct_task_registry(
-//       get_layer_attrs_mapping(cg));
-
-//   // register tasks for realm
-//   std::unordered_map<layer_guid_t, LayerAttrs> const &layer_attrs_mapping =
-//       get_layer_attrs_mapping(cg);
-//   for (std::pair<layer_guid_t, LayerAttrs> const &layer_attrs :
-//        layer_attrs_mapping) {
-//     ComputationGraphOpAttrs attrs = layer_attrs.second.op_attrs;
-//     std::vector<task_id_t> task_ids = get_task_ids(attrs);
-//     for (task_id_t task_id : task_ids) {
-//       TaskSignatureAndImpl task_signature_impl = get_task_sig_impl(task_id);
-//       // TODO: multi gpu
-//       register_wrapper_tasks(0, worker_procs[0], task_id, task_signature_impl);
-//     }
-//   }
-
-//   return task_registry;
-// }
-
-// RealmArgsBacking
-//     initialize_args_backing(RealmTrainingBacking *backing,
-//                             ComputationGraph const &cg,
-//                             RuntimeArgConfig const &runtime_arg_config) {
-//   std::unordered_map<layer_guid_t, DeviceSpecificDeviceStates>
-//       per_device_op_states;
-//   TaskRegistry const &task_registry = backing->task_registry;
-//   RealmTensorBacking const &realm_tensor_backing =
-//       backing->realm_tensor_backing;
-//   Processor master_proc = backing->master_proc;
-//   Memory master_mem = backing->master_mem;
-//   std::vector<Processor> &worker_procs = backing->worker_procs;
-//   std::vector<Realm::Event> &worker_events = backing->worker_events;
-//   // TODO: multi gpu
-//   Allocator &allocator = backing->allocators[0];
-
-//   for (layer_guid_t const &node : topological_ordering(cg)) {
-//     if (registry_contains_task_for_layer(task_registry, node,
-//                                          OpTaskType::INIT)) {
-//       ComputationGraphOpAttrs attrs = get_layer_attrs(cg, node).op_attrs;
-
-//       TaskInvocation invocation = lower_to_task_invocation(
-//           init(attrs), node, get_incoming_inputs(cg, node),
-//           get_incoming_input_shapes(cg, node), get_outgoing_tensors(cg, node),
-//           get_incoming_weights(cg, node),
-//           realm_tensor_backing.tensor_gradient_mapping, std::nullopt);
-//       TaskArgumentAccessor accessor = get_task_arg_accessor(
-//           realm_tensor_backing,
-//           make_args_backing_with_empty_device_states(runtime_arg_config),
-//           invocation,
-//           allocator);
-//       task_id_t task_id = invocation.task_id;
-//       TaskImplFunction impl_function =
-//           task_registry.task_mapping.at(task_id).impl_function;
-//       // TODO: multi gpu launching
-//       Promise promise = Promise();
-//       Future future = promise.get_future();
-//       RealmTaskArgs* task_arg = new RealmTaskArgs{
-//           task_id, impl_function, accessor, std::move(promise)};
-//       uintptr_t args[1] = {reinterpret_cast<uintptr_t>(task_arg)};
-//       Event e =
-//           worker_procs[0].spawn(get_realm_task_id(task_id),
-//                                 args, sizeof(uintptr_t), worker_events[0]);
-//       worker_events[0] = e;
-//       future.set_event(e);
-//       per_device_op_states.insert({node, future.get().value()});
-//     }
-//   }
-
-//   return RealmArgsBacking{runtime_arg_config, per_device_op_states};
-// }
-
-// } // namespace FlexFlow
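
A note on the explicit-instantiation pattern used throughout the lib/utils changes above: each new .cc file pins a template to an archetype type (value_type<0>, ordered_value_type<0>) so the template body is compiled and type-checked when lib/utils itself is built, rather than at the first call site. A minimal self-contained sketch of the same idea follows; the value_type_0 stand-in and the file layout are illustrative assumptions, not the actual FlexFlow headers:

    // collapse_optionals.h -- header-only template (illustrative)
    #include <optional>

    template <typename T>
    std::optional<T>
        collapse_optionals(std::optional<std::optional<T>> const &o) {
      // flatten optional<optional<T>> -> optional<T>; an engaged outer
      // optional may still hold an inner nullopt, which passes through
      if (o.has_value()) {
        return o.value();
      }
      return std::nullopt;
    }

    // collapse_optionals.cc -- pin one explicit instantiation
    struct value_type_0 {}; // minimal stand-in for value_type<0>
    template std::optional<value_type_0>
        collapse_optionals(std::optional<std::optional<value_type_0>> const &);

Instantiating against a deliberately featureless archetype also documents which operations the template actually requires of T (here, little more than copyability), since anything extra would fail to compile.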