From 10accf56d959f529d87acc303c50311b96c869c4 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 9 Jun 2023 13:58:22 -0700 Subject: [PATCH 01/23] Use attrgetter --- mart/callbacks/visualizer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mart/callbacks/visualizer.py b/mart/callbacks/visualizer.py index 8c34b879..a81a94b7 100644 --- a/mart/callbacks/visualizer.py +++ b/mart/callbacks/visualizer.py @@ -4,12 +4,10 @@ # SPDX-License-Identifier: BSD-3-Clause # -import os +from operator import attrgetter from pytorch_lightning.callbacks import Callback -import mart - __all__ = ["ImageVisualizer"] @@ -29,7 +27,7 @@ def log_image(self, trainer, tag, image): def log_images(self, trainer, pl_module): for tag, path in self.tag_paths.items(): - image = mart.utils.get_object(pl_module, path) + image = attrgetter(path)(pl_module) self.log_image(trainer, tag, image) def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx): From d8fe8a08a83b752ad777d88a952ba24c18ddeaa5 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 9 Jun 2023 13:59:21 -0700 Subject: [PATCH 02/23] Restore image_visualizer config --- mart/configs/callbacks/image_visualizer.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 mart/configs/callbacks/image_visualizer.yaml diff --git a/mart/configs/callbacks/image_visualizer.yaml b/mart/configs/callbacks/image_visualizer.yaml new file mode 100644 index 00000000..65b9f8dd --- /dev/null +++ b/mart/configs/callbacks/image_visualizer.yaml @@ -0,0 +1,3 @@ +image_visualizer: + _target_: mart.callbacks.PerturbedImageVisualizer + folder: ${paths.output_dir}/adversarial_examples From 48e3be991b263c29d9934caeed9e899cf8286bde Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 12 Jun 2023 16:05:47 -0700 Subject: [PATCH 03/23] Make *_step_log dicts where the key is the logging name and value is the output key --- mart/models/modular.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index eb5dd934..1fe4be99 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -70,13 +70,13 @@ def __init__( self.lr_scheduler = lr_scheduler - self.training_step_log = training_step_log or ["loss"] + self.training_step_log = training_step_log or {} self.training_metrics = training_metrics - self.validation_step_log = validation_step_log or [] + self.validation_step_log = validation_step_log or {} self.validation_metrics = validation_metrics - self.test_step_log = test_step_log or [] + self.test_step_log = test_step_log or {} self.test_metrics = test_metrics # Load state dict for specified modules. 
We flatten it because Hydra @@ -115,8 +115,8 @@ def training_step(self, batch, batch_idx): input, target = batch output = self(input=input, target=target, model=self.model, step="training") - for name in self.training_step_log: - self.log(f"training/{name}", output[name]) + for log_name, output_key in self.training_step_log.items(): + self.log(f"training/{log_name}", output[output_key], sync_dist=True) assert "loss" in output return output @@ -149,8 +149,8 @@ def validation_step(self, batch, batch_idx): input, target = batch output = self(input=input, target=target, model=self.model, step="validation") - for name in self.validation_step_log: - self.log(f"validation/{name}", output[name]) + for log_name, output_key in self.validation_step_log.items(): + self.log(f"validation/{log_name}", output[output_key], sync_dist=True) return output @@ -175,8 +175,8 @@ def test_step(self, batch, batch_idx): input, target = batch output = self(input=input, target=target, model=self.model, step="test") - for name in self.test_step_log: - self.log(f"test/{name}", output[name]) + for log_name, output_key in self.test_step_log.items(): + self.log(f"test/{log_name}", output[output_key], sync_dist=True) return output From 01a20664b2011c20727d1158b1b7476dc9d44779 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 13:57:49 -0700 Subject: [PATCH 04/23] Fix configs --- mart/configs/model/torchvision_faster_rcnn.yaml | 11 ++++------- mart/configs/model/torchvision_object_detection.yaml | 3 ++- mart/configs/model/torchvision_retinanet.yaml | 4 +++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mart/configs/model/torchvision_faster_rcnn.yaml b/mart/configs/model/torchvision_faster_rcnn.yaml index c5237184..bc0ce228 100644 --- a/mart/configs/model/torchvision_faster_rcnn.yaml +++ b/mart/configs/model/torchvision_faster_rcnn.yaml @@ -4,13 +4,10 @@ defaults: # log all losses separately in training. training_step_log: - [ - "rpn_loss.loss_objectness", - "rpn_loss.loss_rpn_box_reg", - "box_loss.loss_classifier", - "box_loss.loss_box_reg", - "loss", - ] + rpn_loss_objectness: "rpn_loss.loss_objectness" + rpn_loss_rpn_box_reg: "rpn_loss.loss_rpn_box_reg" + box_loss_classifier: "box_loss.loss_classifier" + box_loss_box_reg: "box_loss.loss_box_reg" training_sequence: seq010: diff --git a/mart/configs/model/torchvision_object_detection.yaml b/mart/configs/model/torchvision_object_detection.yaml index a1495dad..c81930a8 100644 --- a/mart/configs/model/torchvision_object_detection.yaml +++ b/mart/configs/model/torchvision_object_detection.yaml @@ -3,7 +3,8 @@ defaults: - modular - /model/modules@modules.preprocessor: tuple_normalizer -training_step_log: ??? +training_step_log: + loss: "loss" training_sequence: ??? diff --git a/mart/configs/model/torchvision_retinanet.yaml b/mart/configs/model/torchvision_retinanet.yaml index 4c45917c..695263a2 100644 --- a/mart/configs/model/torchvision_retinanet.yaml +++ b/mart/configs/model/torchvision_retinanet.yaml @@ -3,7 +3,9 @@ defaults: - torchvision_object_detection # log all losses separately in training. 
-training_step_log: ["loss_classifier", "loss_box_reg"] +training_step_log: + loss_classifier: "loss_classifier" + loss_box_reg: "loss_box_reg" training_sequence: - preprocessor: ["input"] From df1d0b266483ce62fc076573a94449574d082a6c Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 13:57:58 -0700 Subject: [PATCH 05/23] remove sync_dist --- mart/models/modular.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index 1fe4be99..e09f2c9d 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -116,7 +116,7 @@ def training_step(self, batch, batch_idx): output = self(input=input, target=target, model=self.model, step="training") for log_name, output_key in self.training_step_log.items(): - self.log(f"training/{log_name}", output[output_key], sync_dist=True) + self.log(f"training/{log_name}", output[output_key]) assert "loss" in output return output @@ -150,7 +150,7 @@ def validation_step(self, batch, batch_idx): output = self(input=input, target=target, model=self.model, step="validation") for log_name, output_key in self.validation_step_log.items(): - self.log(f"validation/{log_name}", output[output_key], sync_dist=True) + self.log(f"validation/{log_name}", output[output_key]) return output @@ -176,7 +176,7 @@ def test_step(self, batch, batch_idx): output = self(input=input, target=target, model=self.model, step="test") for log_name, output_key in self.test_step_log.items(): - self.log(f"test/{log_name}", output[output_key], sync_dist=True) + self.log(f"test/{log_name}", output[output_key]) return output From 14f4d1fa9f437849655e820f287885864939bf07 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 14:05:53 -0700 Subject: [PATCH 06/23] backwards compatibility --- mart/models/modular.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mart/models/modular.py b/mart/models/modular.py index e09f2c9d..b24ce6ae 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -70,12 +70,21 @@ def __init__( self.lr_scheduler = lr_scheduler + # Be backwards compatible by turning list into dict where each item is its own key-value + if isinstance(training_step_log, (list, tuple)): + training_step_log = { item: item for item in training_step_log} self.training_step_log = training_step_log or {} self.training_metrics = training_metrics + # Be backwards compatible by turning list into dict where each item is its own key-value + if isinstance(validation_step_log, (list, tuple)): + validation_step_log = { item: item for item in validation_step_log} self.validation_step_log = validation_step_log or {} self.validation_metrics = validation_metrics + # Be backwards compatible by turning list into dict where each item is its own key-value + if isinstance(test_step_log, (list, tuple)): + test_step_log = { item: item for item in test_step_log} self.test_step_log = test_step_log or {} self.test_metrics = test_metrics From 2e30587274d99afe8f8dc255da44af68a30a05ef Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 14:06:03 -0700 Subject: [PATCH 07/23] Revert "Fix configs" This reverts commit 01a20664b2011c20727d1158b1b7476dc9d44779. 
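
The revert is safe because the preceding "backwards compatibility" commit makes
LitModular accept both the legacy list form and the new dict form of
training_step_log / validation_step_log / test_step_log. A minimal sketch of that
normalization, written as a standalone illustration rather than the actual module
(the helper name normalize_step_log is made up here):

    # A list entry is logged under the same name it is read from in the step
    # output; a dict maps logging name -> output key explicitly.
    def normalize_step_log(step_log):
        if isinstance(step_log, (list, tuple)):
            step_log = {item: item for item in step_log}
        return step_log or {}

    # Legacy config, e.g. training_step_log: ["loss_classifier", "loss_box_reg"]
    assert normalize_step_log(["loss_classifier", "loss_box_reg"]) == {
        "loss_classifier": "loss_classifier",
        "loss_box_reg": "loss_box_reg",
    }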
--- mart/configs/model/torchvision_faster_rcnn.yaml | 11 +++++++---- mart/configs/model/torchvision_object_detection.yaml | 3 +-- mart/configs/model/torchvision_retinanet.yaml | 4 +--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mart/configs/model/torchvision_faster_rcnn.yaml b/mart/configs/model/torchvision_faster_rcnn.yaml index bc0ce228..c5237184 100644 --- a/mart/configs/model/torchvision_faster_rcnn.yaml +++ b/mart/configs/model/torchvision_faster_rcnn.yaml @@ -4,10 +4,13 @@ defaults: # log all losses separately in training. training_step_log: - rpn_loss_objectness: "rpn_loss.loss_objectness" - rpn_loss_rpn_box_reg: "rpn_loss.loss_rpn_box_reg" - box_loss_classifier: "box_loss.loss_classifier" - box_loss_box_reg: "box_loss.loss_box_reg" + [ + "rpn_loss.loss_objectness", + "rpn_loss.loss_rpn_box_reg", + "box_loss.loss_classifier", + "box_loss.loss_box_reg", + "loss", + ] training_sequence: seq010: diff --git a/mart/configs/model/torchvision_object_detection.yaml b/mart/configs/model/torchvision_object_detection.yaml index c81930a8..a1495dad 100644 --- a/mart/configs/model/torchvision_object_detection.yaml +++ b/mart/configs/model/torchvision_object_detection.yaml @@ -3,8 +3,7 @@ defaults: - modular - /model/modules@modules.preprocessor: tuple_normalizer -training_step_log: - loss: "loss" +training_step_log: ??? training_sequence: ??? diff --git a/mart/configs/model/torchvision_retinanet.yaml b/mart/configs/model/torchvision_retinanet.yaml index 695263a2..4c45917c 100644 --- a/mart/configs/model/torchvision_retinanet.yaml +++ b/mart/configs/model/torchvision_retinanet.yaml @@ -3,9 +3,7 @@ defaults: - torchvision_object_detection # log all losses separately in training. -training_step_log: - loss_classifier: "loss_classifier" - loss_box_reg: "loss_box_reg" +training_step_log: ["loss_classifier", "loss_box_reg"] training_sequence: - preprocessor: ["input"] From 6fef148cb94379eeb4cef983080b90740e5f0bc1 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 14:08:48 -0700 Subject: [PATCH 08/23] style --- mart/models/modular.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index b24ce6ae..d1d2752c 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -72,19 +72,19 @@ def __init__( # Be backwards compatible by turning list into dict where each item is its own key-value if isinstance(training_step_log, (list, tuple)): - training_step_log = { item: item for item in training_step_log} + training_step_log = {item: item for item in training_step_log} self.training_step_log = training_step_log or {} self.training_metrics = training_metrics # Be backwards compatible by turning list into dict where each item is its own key-value if isinstance(validation_step_log, (list, tuple)): - validation_step_log = { item: item for item in validation_step_log} + validation_step_log = {item: item for item in validation_step_log} self.validation_step_log = validation_step_log or {} self.validation_metrics = validation_metrics # Be backwards compatible by turning list into dict where each item is its own key-value if isinstance(test_step_log, (list, tuple)): - test_step_log = { item: item for item in test_step_log} + test_step_log = {item: item for item in test_step_log} self.test_step_log = test_step_log or {} self.test_metrics = test_metrics From c4e0d78813a85fc7f6176ce67def781635b7b3de Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 12 Jun 2023 16:04:53 -0700 Subject: [PATCH 09/23] Make 
metric logging keys configurable --- mart/models/modular.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index eb5dd934..4fcfe783 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -36,6 +36,9 @@ def __init__( test_step_log=None, test_metrics=None, load_state_dict=None, + output_loss_key="loss", + output_preds_key="preds", + output_target_key="target", ): super().__init__() @@ -88,6 +91,10 @@ def __init__( logger.info(f"Loading state_dict {path} for {module.__class__.__name__}...") module.load_state_dict(torch.load(path, map_location="cpu")) + self.output_loss_key = output_loss_key + self.output_preds_key = output_preds_key + self.output_target_key = output_target_key + def configure_optimizers(self): config = {} config["optimizer"] = self.optimizer_fn(self.model) @@ -118,19 +125,15 @@ def training_step(self, batch, batch_idx): for name in self.training_step_log: self.log(f"training/{name}", output[name]) - assert "loss" in output - return output - - def training_step_end(self, output): if self.training_metrics is not None: # Some models only return loss in the training mode. - if "preds" not in output or "target" not in output: + if self.output_preds_key not in output or self.output_target_key not in output: raise ValueError( - "You have specified training_metrics, but the model does not return preds and target during training. You can either nullify training_metrics or configure the model to return preds and target in the training output." + f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output." ) - self.training_metrics(output["preds"], output["target"]) - loss = output.pop("loss") - return loss + self.training_metrics(output[self.output_preds_key], output[self.output_target_key]) + + return output[self.output_loss_key] def training_epoch_end(self, outputs): if self.training_metrics is not None: @@ -152,13 +155,9 @@ def validation_step(self, batch, batch_idx): for name in self.validation_step_log: self.log(f"validation/{name}", output[name]) - return output + self.validation_metrics(output[self.output_preds_key], output[self.output_target_key]) - def validation_step_end(self, output): - self.validation_metrics(output["preds"], output["target"]) - - # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) - output.clear() + return None def validation_epoch_end(self, outputs): metrics = self.validation_metrics.compute() @@ -178,13 +177,9 @@ def test_step(self, batch, batch_idx): for name in self.test_step_log: self.log(f"test/{name}", output[name]) - return output - - def test_step_end(self, output): - self.test_metrics(output["preds"], output["target"]) + self.test_metrics(output[self.output_preds_key], output[self.output_target_key]) - # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) 
- output.clear() + return None def test_epoch_end(self, outputs): metrics = self.test_metrics.compute() From 508798ca12d98d2ce757bcb18779b7c3fe474cdd Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 14:15:36 -0700 Subject: [PATCH 10/23] cleanup --- mart/models/modular.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index 4fcfe783..d663dda3 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -125,6 +125,10 @@ def training_step(self, batch, batch_idx): for name in self.training_step_log: self.log(f"training/{name}", output[name]) + assert "loss" in output + return output + + def training_step_end(self, output): if self.training_metrics is not None: # Some models only return loss in the training mode. if self.output_preds_key not in output or self.output_target_key not in output: @@ -132,8 +136,8 @@ def training_step(self, batch, batch_idx): f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output." ) self.training_metrics(output[self.output_preds_key], output[self.output_target_key]) - - return output[self.output_loss_key] + loss = output.pop(self.output_loss_key) + return loss def training_epoch_end(self, outputs): if self.training_metrics is not None: @@ -155,9 +159,13 @@ def validation_step(self, batch, batch_idx): for name in self.validation_step_log: self.log(f"validation/{name}", output[name]) + return output + + def validation_step_end(self, output): self.validation_metrics(output[self.output_preds_key], output[self.output_target_key]) - return None + # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) + output.clear() def validation_epoch_end(self, outputs): metrics = self.validation_metrics.compute() @@ -177,9 +185,13 @@ def test_step(self, batch, batch_idx): for name in self.test_step_log: self.log(f"test/{name}", output[name]) + return output + + def test_step_end(self, output): self.test_metrics(output[self.output_preds_key], output[self.output_target_key]) - return None + # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) + output.clear() def test_epoch_end(self, outputs): metrics = self.test_metrics.compute() From fc770e81d7783edf7e0ef7bf04b4b12a25a2eaa0 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 17:12:51 -0700 Subject: [PATCH 11/23] Remove *_step_end --- mart/models/modular.py | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/mart/models/modular.py b/mart/models/modular.py index d663dda3..4fcfe783 100644 --- a/mart/models/modular.py +++ b/mart/models/modular.py @@ -125,10 +125,6 @@ def training_step(self, batch, batch_idx): for name in self.training_step_log: self.log(f"training/{name}", output[name]) - assert "loss" in output - return output - - def training_step_end(self, output): if self.training_metrics is not None: # Some models only return loss in the training mode. 
if self.output_preds_key not in output or self.output_target_key not in output: @@ -136,8 +132,8 @@ def training_step_end(self, output): f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output." ) self.training_metrics(output[self.output_preds_key], output[self.output_target_key]) - loss = output.pop(self.output_loss_key) - return loss + + return output[self.output_loss_key] def training_epoch_end(self, outputs): if self.training_metrics is not None: @@ -159,13 +155,9 @@ def validation_step(self, batch, batch_idx): for name in self.validation_step_log: self.log(f"validation/{name}", output[name]) - return output - - def validation_step_end(self, output): self.validation_metrics(output[self.output_preds_key], output[self.output_target_key]) - # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) - output.clear() + return None def validation_epoch_end(self, outputs): metrics = self.validation_metrics.compute() @@ -185,13 +177,9 @@ def test_step(self, batch, batch_idx): for name in self.test_step_log: self.log(f"test/{name}", output[name]) - return output - - def test_step_end(self, output): self.test_metrics(output[self.output_preds_key], output[self.output_target_key]) - # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.) - output.clear() + return None def test_epoch_end(self, outputs): metrics = self.test_metrics.compute() From c31f4deddb65f6008f8c570bafd3ed2bf1c33d77 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 12 Jun 2023 15:56:55 -0700 Subject: [PATCH 12/23] Don't require output module with SequentialDict --- mart/nn/nn.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mart/nn/nn.py b/mart/nn/nn.py index 93b0f07f..754e8657 100644 --- a/mart/nn/nn.py +++ b/mart/nn/nn.py @@ -49,10 +49,6 @@ class SequentialDict(torch.nn.ModuleDict): """ def __init__(self, modules, sequences=None): - - if "output" not in modules: - raise ValueError("Modules must have an module named 'output'") - super().__init__(modules) self._sequences = { @@ -121,7 +117,8 @@ def forward(self, step=None, sequence=None, **kwargs): # Pop the executed module to proceed with the sequence sequence.popitem(last=False) - return kwargs["output"] + # return kwargs as DotDict + return DotDict(kwargs) class ReturnKwargs(torch.nn.Module): From 549f705c3a71e58cb014fd3ffe564f717580578a Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 17:42:07 -0700 Subject: [PATCH 13/23] fix configs and tests --- .../attack/gain/rcnn_training_loss.yaml | 8 ++-- mart/configs/model/classifier.yaml | 17 ------- .../model/torchvision_faster_rcnn.yaml | 46 ++----------------- .../model/torchvision_object_detection.yaml | 10 ++-- mart/configs/model/torchvision_retinanet.yaml | 27 ++--------- tests/test_experiments.py | 6 +-- 6 files changed, 18 insertions(+), 96 deletions(-) diff --git a/mart/configs/attack/gain/rcnn_training_loss.yaml b/mart/configs/attack/gain/rcnn_training_loss.yaml index eb7abb9c..9ed8671b 100644 --- a/mart/configs/attack/gain/rcnn_training_loss.yaml +++ b/mart/configs/attack/gain/rcnn_training_loss.yaml @@ -2,8 +2,8 @@ _target_: mart.nn.CallWith module: _target_: mart.nn.Sum arg_keys: - - rpn_loss.loss_objectness - - 
rpn_loss.loss_rpn_box_reg - - box_loss.loss_classifier - - box_loss.loss_box_reg + - "losses_and_detections.training.loss_objectness" + - "losses_and_detections.training.loss_rpn_box_reg" + - "losses_and_detections.training.loss_classifier" + - "losses_and_detections.training.loss_box_reg" kwarg_keys: null diff --git a/mart/configs/model/classifier.yaml b/mart/configs/model/classifier.yaml index ad664989..df1a9c5b 100644 --- a/mart/configs/model/classifier.yaml +++ b/mart/configs/model/classifier.yaml @@ -17,14 +17,6 @@ training_sequence: seq040: preds: _call_with_args_: ["logits"] - seq050: - output: - { - "preds": "preds", - "target": "target", - "logits": "logits", - "loss": "loss", - } # The kwargs-centric version. # We may use *args as **kwargs to avoid the lengthy _call_with_args_. @@ -36,10 +28,6 @@ validation_sequence: - logits: ["preprocessor"] - preds: input: logits - - output: - preds: preds - target: target - logits: logits # The simplified version. # We treat a list as the `_call_with_args_` parameter. @@ -50,8 +38,6 @@ test_sequence: logits: ["preprocessor"] seq030: preds: ["logits"] - seq040: - output: { preds: preds, target: target, logits: logits } modules: preprocessor: ??? @@ -64,6 +50,3 @@ modules: preds: _target_: torch.nn.Softmax dim: 1 - - output: - _target_: mart.nn.ReturnKwargs diff --git a/mart/configs/model/torchvision_faster_rcnn.yaml b/mart/configs/model/torchvision_faster_rcnn.yaml index c5237184..65200579 100644 --- a/mart/configs/model/torchvision_faster_rcnn.yaml +++ b/mart/configs/model/torchvision_faster_rcnn.yaml @@ -4,13 +4,10 @@ defaults: # log all losses separately in training. training_step_log: - [ - "rpn_loss.loss_objectness", - "rpn_loss.loss_rpn_box_reg", - "box_loss.loss_classifier", - "box_loss.loss_box_reg", - "loss", - ] + loss_objectness: "losses_and_detections.training.loss_objectness" + loss_rpn_box_reg: "losses_and_detections.training.loss_rpn_box_reg" + loss_classifier: "losses_and_detections.training.loss_classifier" + loss_box_reg: "losses_and_detections.training.loss_box_reg" training_sequence: seq010: @@ -29,19 +26,6 @@ training_sequence: "losses_and_detections.training.loss_box_reg", ] - seq040: - output: - # Output all losses for logging, defined in model.training_step_log - { - "preds": "losses_and_detections.eval", - "target": "target", - "loss": "loss", - "rpn_loss.loss_objectness": "losses_and_detections.training.loss_objectness", - "rpn_loss.loss_rpn_box_reg": "losses_and_detections.training.loss_rpn_box_reg", - "box_loss.loss_classifier": "losses_and_detections.training.loss_classifier", - "box_loss.loss_box_reg": "losses_and_detections.training.loss_box_reg", - } - validation_sequence: seq010: preprocessor: ["input"] @@ -49,17 +33,6 @@ validation_sequence: seq020: losses_and_detections: ["preprocessor", "target"] - seq030: - output: - { - "preds": "losses_and_detections.eval", - "target": "target", - "rpn_loss.loss_objectness": "losses_and_detections.training.loss_objectness", - "rpn_loss.loss_rpn_box_reg": "losses_and_detections.training.loss_rpn_box_reg", - "box_loss.loss_classifier": "losses_and_detections.training.loss_classifier", - "box_loss.loss_box_reg": "losses_and_detections.training.loss_box_reg", - } - test_sequence: seq010: preprocessor: ["input"] @@ -67,17 +40,6 @@ test_sequence: seq020: losses_and_detections: ["preprocessor", "target"] - seq030: - output: - { - "preds": "losses_and_detections.eval", - "target": "target", - "rpn_loss.loss_objectness": "losses_and_detections.training.loss_objectness", - 
"rpn_loss.loss_rpn_box_reg": "losses_and_detections.training.loss_rpn_box_reg", - "box_loss.loss_classifier": "losses_and_detections.training.loss_classifier", - "box_loss.loss_box_reg": "losses_and_detections.training.loss_box_reg", - } - modules: losses_and_detections: # 17s: DualModeGeneralizedRCNN diff --git a/mart/configs/model/torchvision_object_detection.yaml b/mart/configs/model/torchvision_object_detection.yaml index a1495dad..1bbd678c 100644 --- a/mart/configs/model/torchvision_object_detection.yaml +++ b/mart/configs/model/torchvision_object_detection.yaml @@ -3,14 +3,15 @@ defaults: - modular - /model/modules@modules.preprocessor: tuple_normalizer -training_step_log: ??? +training_step_log: + loss: "loss" training_sequence: ??? - validation_sequence: ??? - test_sequence: ??? +output_preds_key: "losses_and_detections.eval" + modules: losses_and_detections: # Return losses in the training mode and predictions in the eval mode in one pass. @@ -19,6 +20,3 @@ modules: loss: _target_: mart.nn.Sum - - output: - _target_: mart.nn.ReturnKwargs diff --git a/mart/configs/model/torchvision_retinanet.yaml b/mart/configs/model/torchvision_retinanet.yaml index 4c45917c..34b66945 100644 --- a/mart/configs/model/torchvision_retinanet.yaml +++ b/mart/configs/model/torchvision_retinanet.yaml @@ -3,7 +3,9 @@ defaults: - torchvision_object_detection # log all losses separately in training. -training_step_log: ["loss_classifier", "loss_box_reg"] +training_step_log: + loss_classifier: "losses_and_detections.training.classification" + loss_box_reg: "losses_and_detections.training.bbox_regression" training_sequence: - preprocessor: ["input"] @@ -14,37 +16,14 @@ training_sequence: "losses_and_detections.training.classification", "losses_and_detections.training.bbox_regression", ] - - output: - # Output all losses for logging, defined in model.training_step_log - { - "preds": "losses_and_detections.eval", - "target": "target", - "loss": "loss", - "loss_classifier": "losses_and_detections.training.classification", - "loss_box_reg": "losses_and_detections.training.bbox_regression", - } validation_sequence: - preprocessor: ["input"] - losses_and_detections: ["preprocessor", "target"] - - output: - { - "preds": "losses_and_detections.eval", - "target": "target", - "loss_classifier": "losses_and_detections.training.classification", - "loss_box_reg": "losses_and_detections.training.bbox_regression", - } test_sequence: - preprocessor: ["input"] - losses_and_detections: ["preprocessor", "target"] - - output: - { - "preds": "losses_and_detections.eval", - "target": "target", - "loss_classifier": "losses_and_detections.training.classification", - "loss_box_reg": "losses_and_detections.training.bbox_regression", - } modules: losses_and_detections: diff --git a/tests/test_experiments.py b/tests/test_experiments.py index d128c1df..cf4ffea7 100644 --- a/tests/test_experiments.py +++ b/tests/test_experiments.py @@ -209,7 +209,7 @@ def test_coco_fasterrcnn_experiment(coco_cfg, tmp_path): "-m", "experiment=COCO_TorchvisionFasterRCNN", "hydra.sweep.dir=" + str(tmp_path), - "optimized_metric=training/rpn_loss.loss_objectness", + "optimized_metric=training/loss_objectness", ] + overrides run_sh_command(command) @@ -224,7 +224,7 @@ def test_coco_fasterrcnn_adv_experiment(coco_cfg, tmp_path): "-m", "experiment=COCO_TorchvisionFasterRCNN_Adv", "hydra.sweep.dir=" + str(tmp_path), - "optimized_metric=training/rpn_loss.loss_objectness", + "optimized_metric=training/loss_objectness", ] + overrides run_sh_command(command) @@ -256,7 
+256,7 @@ def test_armory_carla_fasterrcnn_experiment(carla_cfg, tmp_path): "experiment=ArmoryCarlaOverObjDet_TorchvisionFasterRCNN", "+attack@model.modules.input_adv_test=object_detection_mask_adversary", "hydra.sweep.dir=" + str(tmp_path), - "optimized_metric=training/rpn_loss.loss_objectness", + "optimized_metric=training/loss_objectness", ] + overrides run_sh_command(command) From 5e7381743d018ab27d3ed61fb033244192b2ebbf Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Tue, 13 Jun 2023 17:55:33 -0700 Subject: [PATCH 14/23] Generalize attack objectives --- mart/configs/attack/objective/misclassification.yaml | 4 ++-- mart/configs/attack/objective/object_detection_missed.yaml | 2 +- mart/configs/attack/objective/zero_ap.yaml | 4 ++-- mart/configs/model/modular.yaml | 3 +++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mart/configs/attack/objective/misclassification.yaml b/mart/configs/attack/objective/misclassification.yaml index e2e9b819..82e055cd 100644 --- a/mart/configs/attack/objective/misclassification.yaml +++ b/mart/configs/attack/objective/misclassification.yaml @@ -2,6 +2,6 @@ _target_: mart.nn.CallWith module: _target_: mart.attack.objective.Mispredict arg_keys: - - preds - - target + - ${model.output_preds_key} + - ${model.output_target_key} kwarg_keys: null diff --git a/mart/configs/attack/objective/object_detection_missed.yaml b/mart/configs/attack/objective/object_detection_missed.yaml index dec2410c..7ebb1dc3 100644 --- a/mart/configs/attack/objective/object_detection_missed.yaml +++ b/mart/configs/attack/objective/object_detection_missed.yaml @@ -3,5 +3,5 @@ module: _target_: mart.attack.objective.Missed confidence_threshold: 0.0 arg_keys: - - preds + - ${model.output_preds_key} kwarg_keys: null diff --git a/mart/configs/attack/objective/zero_ap.yaml b/mart/configs/attack/objective/zero_ap.yaml index 6a43f77d..91dc5b96 100644 --- a/mart/configs/attack/objective/zero_ap.yaml +++ b/mart/configs/attack/objective/zero_ap.yaml @@ -4,6 +4,6 @@ module: iou_threshold: 0.5 confidence_threshold: 0.0 arg_keys: - - preds - - target + - ${model.output_preds_key} + - ${model.output_target_key} kwarg_keys: null diff --git a/mart/configs/model/modular.yaml b/mart/configs/model/modular.yaml index f4a6976f..6c137a53 100644 --- a/mart/configs/model/modular.yaml +++ b/mart/configs/model/modular.yaml @@ -1,6 +1,9 @@ _target_: mart.models.LitModular _convert_: all +output_preds_key: "preds" +output_target_key: "target" + modules: ??? optimizer: ??? 
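
Note on the two preceding patches: because SequentialDict now returns every module
output (as a DotDict) instead of requiring a dedicated "output" module, the logging
and metric keys in the configs ("loss", "losses_and_detections.eval",
"losses_and_detections.training.loss_objectness", ...) name entries of that output
dict, and the attack objectives reuse them through ${model.output_preds_key} and
${model.output_target_key}. A rough sketch of what dotted-key lookup over such a
nested output could look like, assuming DotDict-style behaviour (the function name
and example values below are illustrative only, not MART code):

    from functools import reduce

    def get_by_dotted_key(output, dotted_key):
        # "a.b.c" walks output["a"]["b"]["c"]; a key without dots is a plain lookup.
        return reduce(lambda d, k: d[k], dotted_key.split("."), output)

    output = {
        "losses_and_detections": {"eval": ["pred_0", "pred_1"]},
        "target": ["target_0", "target_1"],
    }
    assert get_by_dotted_key(output, "losses_and_detections.eval") == ["pred_0", "pred_1"]
    assert get_by_dotted_key(output, "target") == ["target_0", "target_1"]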
From fe8a0f8169577498939bb3ccc1598ebfe679ccd0 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 08:48:33 -0700 Subject: [PATCH 15/23] Add torchvision YOLO model from https://github.com/pytorch/vision/pull/7496 --- mart/models/detection/__init__.py | 18 + mart/models/detection/anchor_utils.py | 324 ++++ mart/models/detection/box_utils.py | 81 + mart/models/detection/target_matching.py | 453 +++++ mart/models/detection/yolo.py | 438 +++++ mart/models/detection/yolo_loss.py | 363 ++++ mart/models/detection/yolo_networks.py | 2026 ++++++++++++++++++++++ mart/models/yolo.py | 732 ++++++++ 8 files changed, 4435 insertions(+) create mode 100644 mart/models/detection/__init__.py create mode 100644 mart/models/detection/anchor_utils.py create mode 100644 mart/models/detection/box_utils.py create mode 100644 mart/models/detection/target_matching.py create mode 100644 mart/models/detection/yolo.py create mode 100644 mart/models/detection/yolo_loss.py create mode 100644 mart/models/detection/yolo_networks.py create mode 100644 mart/models/yolo.py diff --git a/mart/models/detection/__init__.py b/mart/models/detection/__init__.py new file mode 100644 index 00000000..4bf1a515 --- /dev/null +++ b/mart/models/detection/__init__.py @@ -0,0 +1,18 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/__init__.py +from .faster_rcnn import * +from .fcos import * +from .keypoint_rcnn import * +from .mask_rcnn import * +from .retinanet import * +from .ssd import * +from .ssdlite import * +from .yolo import YOLO, yolo_darknet, yolov4, YOLOV4_Backbone_Weights, YOLOV4_Weights +from .yolo_networks import ( + DarknetNetwork, + YOLOV4Network, + YOLOV4P6Network, + YOLOV4TinyNetwork, + YOLOV5Network, + YOLOV7Network, + YOLOXNetwork, +) diff --git a/mart/models/detection/anchor_utils.py b/mart/models/detection/anchor_utils.py new file mode 100644 index 00000000..943071b0 --- /dev/null +++ b/mart/models/detection/anchor_utils.py @@ -0,0 +1,324 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/anchor_utils.py + +import math +from typing import List, Optional + +import torch +from torch import nn, Tensor + +from .image_list import ImageList + + +class AnchorGenerator(nn.Module): + """ + Module that generates anchors for a set of feature maps and + image sizes. + + The module support computing anchors at multiple sizes and aspect ratios + per feature map. This module assumes aspect ratio = height / width for + each anchor. + + sizes and aspect_ratios should have the same number of elements, and it should + correspond to the number of feature maps. + + sizes[i] and aspect_ratios[i] can have an arbitrary number of elements, + and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors + per spatial location for feature map i. 
+ + Args: + sizes (Tuple[Tuple[int]]): + aspect_ratios (Tuple[Tuple[float]]): + """ + + __annotations__ = { + "cell_anchors": List[torch.Tensor], + } + + def __init__( + self, + sizes=((128, 256, 512),), + aspect_ratios=((0.5, 1.0, 2.0),), + ): + super().__init__() + + if not isinstance(sizes[0], (list, tuple)): + # TODO change this + sizes = tuple((s,) for s in sizes) + if not isinstance(aspect_ratios[0], (list, tuple)): + aspect_ratios = (aspect_ratios,) * len(sizes) + + self.sizes = sizes + self.aspect_ratios = aspect_ratios + self.cell_anchors = [ + self.generate_anchors(size, aspect_ratio) for size, aspect_ratio in zip(sizes, aspect_ratios) + ] + + # TODO: https://github.com/pytorch/pytorch/issues/26792 + # For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values. + # (scales, aspect_ratios) are usually an element of zip(self.scales, self.aspect_ratios) + # This method assumes aspect ratio = height / width for an anchor. + def generate_anchors( + self, + scales: List[int], + aspect_ratios: List[float], + dtype: torch.dtype = torch.float32, + device: torch.device = torch.device("cpu"), + ) -> Tensor: + scales = torch.as_tensor(scales, dtype=dtype, device=device) + aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device) + h_ratios = torch.sqrt(aspect_ratios) + w_ratios = 1 / h_ratios + + ws = (w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h_ratios[:, None] * scales[None, :]).view(-1) + + base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2 + return base_anchors.round() + + def set_cell_anchors(self, dtype: torch.dtype, device: torch.device): + self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors] + + def num_anchors_per_location(self) -> List[int]: + return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)] + + # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2), + # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a. + def grid_anchors(self, grid_sizes: List[List[int]], strides: List[List[Tensor]]) -> List[Tensor]: + anchors = [] + cell_anchors = self.cell_anchors + torch._assert(cell_anchors is not None, "cell_anchors should not be None") + torch._assert( + len(grid_sizes) == len(strides) == len(cell_anchors), + "Anchors should be Tuple[Tuple[int]] because each feature " + "map could potentially have different sizes and aspect ratios. " + "There needs to be a match between the number of " + "feature maps passed and the number of sizes / aspect ratios specified.", + ) + + for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors): + grid_height, grid_width = size + stride_height, stride_width = stride + device = base_anchors.device + + # For output anchor, compute [x_center, y_center, x_center, y_center] + shifts_x = torch.arange(0, grid_width, dtype=torch.int32, device=device) * stride_width + shifts_y = torch.arange(0, grid_height, dtype=torch.int32, device=device) * stride_height + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij") + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + # For every (base anchor, output anchor) pair, + # offset each zero-centered base anchor by the center of the output anchor. 
+ anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)) + + return anchors + + def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Tensor]: + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] + image_size = image_list.tensors.shape[-2:] + dtype, device = feature_maps[0].dtype, feature_maps[0].device + strides = [ + [ + torch.empty((), dtype=torch.int64, device=device).fill_(image_size[0] // g[0]), + torch.empty((), dtype=torch.int64, device=device).fill_(image_size[1] // g[1]), + ] + for g in grid_sizes + ] + self.set_cell_anchors(dtype, device) + anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides) + anchors: List[List[torch.Tensor]] = [] + for _ in range(len(image_list.image_sizes)): + anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps] + anchors.append(anchors_in_image) + anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors] + return anchors + + +class DefaultBoxGenerator(nn.Module): + """ + This module generates the default boxes of SSD for a set of feature maps and image sizes. + + Args: + aspect_ratios (List[List[int]]): A list with all the aspect ratios used in each feature map. + min_ratio (float): The minimum scale :math:`\text{s}_{\text{min}}` of the default boxes used in the estimation + of the scales of each feature map. It is used only if the ``scales`` parameter is not provided. + max_ratio (float): The maximum scale :math:`\text{s}_{\text{max}}` of the default boxes used in the estimation + of the scales of each feature map. It is used only if the ``scales`` parameter is not provided. + scales (List[float]], optional): The scales of the default boxes. If not provided it will be estimated using + the ``min_ratio`` and ``max_ratio`` parameters. + steps (List[int]], optional): It's a hyper-parameter that affects the tiling of default boxes. If not provided + it will be estimated from the data. + clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping + is applied while the boxes are encoded in format ``(cx, cy, w, h)``. 
+ """ + + def __init__( + self, + aspect_ratios: List[List[int]], + min_ratio: float = 0.15, + max_ratio: float = 0.9, + scales: Optional[List[float]] = None, + steps: Optional[List[int]] = None, + clip: bool = True, + ): + super().__init__() + if steps is not None and len(aspect_ratios) != len(steps): + raise ValueError("aspect_ratios and steps should have the same length") + self.aspect_ratios = aspect_ratios + self.steps = steps + self.clip = clip + num_outputs = len(aspect_ratios) + + # Estimation of default boxes scales + if scales is None: + if num_outputs > 1: + range_ratio = max_ratio - min_ratio + self.scales = [min_ratio + range_ratio * k / (num_outputs - 1.0) for k in range(num_outputs)] + self.scales.append(1.0) + else: + self.scales = [min_ratio, max_ratio] + else: + self.scales = scales + + self._wh_pairs = self._generate_wh_pairs(num_outputs) + + def _generate_wh_pairs( + self, num_outputs: int, dtype: torch.dtype = torch.float32, device: torch.device = torch.device("cpu") + ) -> List[Tensor]: + _wh_pairs: List[Tensor] = [] + for k in range(num_outputs): + # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k + s_k = self.scales[k] + s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1]) + wh_pairs = [[s_k, s_k], [s_prime_k, s_prime_k]] + + # Adding 2 pairs for each aspect ratio of the feature map k + for ar in self.aspect_ratios[k]: + sq_ar = math.sqrt(ar) + w = self.scales[k] * sq_ar + h = self.scales[k] / sq_ar + wh_pairs.extend([[w, h], [h, w]]) + + _wh_pairs.append(torch.as_tensor(wh_pairs, dtype=dtype, device=device)) + return _wh_pairs + + def num_anchors_per_location(self) -> List[int]: + # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. + return [2 + 2 * len(r) for r in self.aspect_ratios] + + # Default Boxes calculation based on page 6 of SSD paper + def _grid_default_boxes( + self, grid_sizes: List[List[int]], image_size: List[int], dtype: torch.dtype = torch.float32 + ) -> Tensor: + default_boxes = [] + for k, f_k in enumerate(grid_sizes): + # Now add the default boxes for each width-height pair + if self.steps is not None: + x_f_k = image_size[1] / self.steps[k] + y_f_k = image_size[0] / self.steps[k] + else: + y_f_k, x_f_k = f_k + + shifts_x = ((torch.arange(0, f_k[1]) + 0.5) / x_f_k).to(dtype=dtype) + shifts_y = ((torch.arange(0, f_k[0]) + 0.5) / y_f_k).to(dtype=dtype) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij") + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + + shifts = torch.stack((shift_x, shift_y) * len(self._wh_pairs[k]), dim=-1).reshape(-1, 2) + # Clipping the default boxes while the boxes are encoded in format (cx, cy, w, h) + _wh_pair = self._wh_pairs[k].clamp(min=0, max=1) if self.clip else self._wh_pairs[k] + wh_pairs = _wh_pair.repeat((f_k[0] * f_k[1]), 1) + + default_box = torch.cat((shifts, wh_pairs), dim=1) + + default_boxes.append(default_box) + + return torch.cat(default_boxes, dim=0) + + def __repr__(self) -> str: + s = ( + f"{self.__class__.__name__}(" + f"aspect_ratios={self.aspect_ratios}" + f", clip={self.clip}" + f", scales={self.scales}" + f", steps={self.steps}" + ")" + ) + return s + + def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Tensor]: + grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] + image_size = image_list.tensors.shape[-2:] + dtype, device = feature_maps[0].dtype, feature_maps[0].device + default_boxes = self._grid_default_boxes(grid_sizes, image_size, 
dtype=dtype) + default_boxes = default_boxes.to(device) + + dboxes = [] + x_y_size = torch.tensor([image_size[1], image_size[0]], device=default_boxes.device) + for _ in image_list.image_sizes: + dboxes_in_image = default_boxes + dboxes_in_image = torch.cat( + [ + (dboxes_in_image[:, :2] - 0.5 * dboxes_in_image[:, 2:]) * x_y_size, + (dboxes_in_image[:, :2] + 0.5 * dboxes_in_image[:, 2:]) * x_y_size, + ], + -1, + ) + dboxes.append(dboxes_in_image) + return dboxes + + +def grid_offsets(grid_size: Tensor) -> Tensor: + """Given a grid size, returns a tensor containing offsets to the grid cells. + + Args: + The width and height of the grid in a tensor. + + Returns: + A ``[height, width, 2]`` tensor containing the grid cell `(x, y)` offsets. + """ + x_range = torch.arange(grid_size[0].item(), device=grid_size.device) + y_range = torch.arange(grid_size[1].item(), device=grid_size.device) + grid_y, grid_x = torch.meshgrid([y_range, x_range], indexing="ij") + return torch.stack((grid_x, grid_y), -1) + + +def grid_centers(grid_size: Tensor) -> Tensor: + """Given a grid size, returns a tensor containing coordinates to the centers of the grid cells. + + Returns: + A ``[height, width, 2]`` tensor containing coordinates to the centers of the grid cells. + """ + return grid_offsets(grid_size) + 0.5 + + +@torch.jit.script +def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: + """Adds offsets to the predicted box center coordinates to obtain global coordinates to the image. + + The predicted coordinates are interpreted as coordinates inside a grid cell whose width and height is 1. Adding + offset to the cell, dividing by the grid size, and multiplying by the image size, we get global coordinates in the + image scale. + + The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing based + generator will loose track of e.g. ``xy.shape[1]`` and treat it as a Python variable and not a tensor. This will + cause the dimension to be treated as a constant in the model, which prevents dynamic input sizes. + + Args: + xy: The predicted center coordinates before scaling. Values from zero to one in a tensor sized + ``[batch_size, height, width, boxes_per_cell, 2]``. + image_size: Width and height in a vector that will be used to scale the coordinates. + + Returns: + Global coordinates scaled to the size of the network input image, in a tensor with the same shape as the input + tensor. + """ + height = xy.shape[1] + width = xy.shape[2] + grid_size = torch.tensor([width, height], device=xy.device) + # Scripting requires explicit conversion to a floating point type. + offset = grid_offsets(grid_size).to(xy.dtype).unsqueeze(2) # [height, width, 1, 2] + scale = torch.true_divide(image_size, grid_size) + return (xy + offset) * scale diff --git a/mart/models/detection/box_utils.py b/mart/models/detection/box_utils.py new file mode 100644 index 00000000..0813961b --- /dev/null +++ b/mart/models/detection/box_utils.py @@ -0,0 +1,81 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/box_utils.py +import torch +from torch import Tensor + +from ...ops import box_iou + + +def aligned_iou(wh1: Tensor, wh2: Tensor) -> Tensor: + """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at + the same coordinates. + + Args: + wh1: An ``[N, 2]`` matrix of box shapes (width and height). + wh2: An ``[M, 2]`` matrix of box shapes (width and height). 
+ + Returns: + An ``[N, M]`` matrix of pairwise IoU values for every element in ``wh1`` and ``wh2`` + """ + area1 = wh1[:, 0] * wh1[:, 1] # [N] + area2 = wh2[:, 0] * wh2[:, 1] # [M] + + inter_wh = torch.min(wh1[:, None, :], wh2) # [N, M, 2] + inter = inter_wh[:, :, 0] * inter_wh[:, :, 1] # [N, M] + union = area1[:, None] + area2 - inter # [N, M] + + return inter / union + + +def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> Tensor: + """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target + significantly (IoU greater than ``threshold``). + + Args: + pred_boxes: The predicted corner coordinates. Tensor of size ``[height, width, boxes_per_cell, 4]``. + target_boxes: Corner coordinates of the target boxes. Tensor of size ``[height, width, boxes_per_cell, 4]``. + + Returns: + A boolean tensor sized ``[height, width, boxes_per_cell]``, with ``False`` where the predicted box overlaps a + target significantly and ``True`` elsewhere. + """ + shape = pred_boxes.shape[:-1] + pred_boxes = pred_boxes.view(-1, 4) + ious = box_iou(pred_boxes, target_boxes) + best_iou = ious.max(-1).values + below_threshold = best_iou <= threshold + return below_threshold.view(shape) + + +def is_inside_box(points: Tensor, boxes: Tensor) -> Tensor: + """Get pairwise truth values of whether the point is inside the box. + + Args: + points: Point (x, y) coordinates, a tensor shaped ``[points, 2]``. + boxes: Box (x1, y1, x2, y2) coordinates, a tensor shaped ``[boxes, 4]``. + + Returns: + A tensor shaped ``[points, boxes]`` containing pairwise truth values of whether the points are inside the boxes. + """ + lt = points[:, None, :] - boxes[None, :, :2] # [boxes, points, 2] + rb = boxes[None, :, 2:] - points[:, None, :] # [boxes, points, 2] + deltas = torch.cat((lt, rb), -1) # [points, boxes, 4] + return deltas.min(-1).values > 0.0 # [points, boxes] + + +def box_size_ratio(wh1: Tensor, wh2: Tensor) -> Tensor: + """Compares the dimensions of the boxes pairwise. + + For each pair of boxes, calculates the largest ratio that can be obtained by dividing the widths with each other or + dividing the heights with each other. + + Args: + wh1: An ``[N, 2]`` matrix of box shapes (width and height). + wh2: An ``[M, 2]`` matrix of box shapes (width and height). + + Returns: + An ``[N, M]`` matrix of ratios of width or height dimensions, whichever is larger. + """ + wh_ratio = wh1[:, None, :] / wh2[None, :, :] # [M, N, 2] + wh_ratio = torch.max(wh_ratio, 1.0 / wh_ratio) + wh_ratio = wh_ratio.max(2).values # [M, N] + return wh_ratio diff --git a/mart/models/detection/target_matching.py b/mart/models/detection/target_matching.py new file mode 100644 index 00000000..7e71457c --- /dev/null +++ b/mart/models/detection/target_matching.py @@ -0,0 +1,453 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/target_matching.py +from typing import Dict, List, Tuple + +import torch +from torch import Tensor + +from ...ops import box_convert +from .anchor_utils import grid_centers +from .box_utils import aligned_iou, box_size_ratio, iou_below, is_inside_box +from .yolo_loss import YOLOLoss + +PRIOR_SHAPES = List[List[int]] # TorchScript doesn't allow a list of tuples. + + +def target_boxes_to_grid(preds: Tensor, targets: Tensor, image_size: Tensor) -> Tuple[Tensor, Tensor]: + """Scales target bounding boxes to feature map coordinates. 
+ + It would be better to implement this in a super class, but TorchScript doesn't allow class inheritance. + + Args: + preds: Predicted bounding boxes for a single image. + targets: Target bounding boxes for a single image. + image_size: Input image width and height. + + Returns: + Two tensors with as many rows as there are targets. An integer tensor containing x/y coordinates to the feature + map that correspond to the target position, and a floating point tensor containing the target width and height + scaled to the feature map size. + """ + height, width = preds.shape[:2] + + # A multiplier for scaling image coordinates to feature map coordinates + grid_size = torch.tensor([width, height], device=image_size.device) + image_to_grid = torch.true_divide(grid_size, image_size) + + # Bounding box center coordinates are converted to the feature map dimensions so that the whole number tells the + # cell index and the fractional part tells the location inside the cell. + xywh = box_convert(targets, in_fmt="xyxy", out_fmt="cxcywh") + xy = (xywh[:, :2] * image_to_grid).to(torch.int64) + x = xy[:, 0].clamp(0, width - 1) + y = xy[:, 1].clamp(0, height - 1) + xy = torch.stack((x, y), 1) + return xy, xywh[:, 2:] + + +class HighestIoUMatching: + """For each target, select the prior shape that gives the highest IoU. + + This is the original YOLO matching rule. + + Args: + prior_shapes: A list of all the prior box dimensions. The list should contain [width, height] pairs in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + """ + + def __init__( + self, prior_shapes: PRIOR_SHAPES, prior_shape_idxs: List[int], ignore_bg_threshold: float = 0.7 + ) -> None: + self.prior_shapes = prior_shapes + # anchor_map maps the anchor indices to anchors in this layer, or to -1 if it's not an anchor of this layer. + # This layer ignores the target if all the selected anchors are in another layer. + self.anchor_map = [ + prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 for idx in range(len(prior_shapes)) + ] + self.ignore_bg_threshold = ignore_bg_threshold + + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + + Args: + wh: A matrix of predicted width and height values. + + Returns: + matched_targets, matched_anchors: Two vectors. The first vector is used to select the targets that this + layer matched and the second one lists the matching anchors within the grid cell. 
+ """ + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + anchor_map = torch.tensor(self.anchor_map, dtype=torch.int64, device=wh.device) + + ious = aligned_iou(wh, prior_wh) + highest_iou_anchors = ious.max(1).indices + highest_iou_anchors = anchor_map[highest_iou_anchors] + matched_targets = highest_iou_anchors >= 0 + matched_anchors = highest_iou_anchors[matched_targets] + return matched_targets, matched_anchors + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[List[Tensor], Tensor, Tensor]: + """For each target, selects predictions from the same grid cell, where the center of the target box is. + + Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the + predictions within the grid cell. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + The indices of the matched predictions, background mask, and a mask for selecting the matched targets. + """ + anchor_xy, target_wh = target_boxes_to_grid(preds["boxes"], targets["boxes"], image_size) + target_selector, anchor_idx = self.match(target_wh) + anchor_x = anchor_xy[target_selector, 0] + anchor_y = anchor_xy[target_selector, 1] + + # Background mask is used to select anchors that are not responsible for predicting any object, for + # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a + # predicted box overlaps any target significantly, or if a prediction is matched to a target. + background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) + background_mask[anchor_y, anchor_x, anchor_idx] = False + + pred_selector = [anchor_y, anchor_x, anchor_idx] + return pred_selector, background_mask, target_selector + + +class IoUThresholdMatching: + """For each target, select all prior shapes that give a high enough IoU. + + Args: + prior_shapes: A list of all the prior box dimensions. The list should contain [width, height] pairs in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + threshold: IoU treshold for matching. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + """ + + def __init__( + self, + prior_shapes: PRIOR_SHAPES, + prior_shape_idxs: List[int], + threshold: float, + ignore_bg_threshold: float = 0.7, + ) -> None: + self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] + self.threshold = threshold + self.ignore_bg_threshold = ignore_bg_threshold + + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + + Args: + wh: A matrix of predicted width and height values. + + Returns: + matched_targets, matched_anchors: Two vectors. The first vector is used to select the targets that this + layer matched and the second one lists the matching anchors within the grid cell. 
+ """ + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + + ious = aligned_iou(wh, prior_wh) + above_threshold = (ious > self.threshold).nonzero() + return above_threshold[:, 0], above_threshold[:, 1] + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[List[Tensor], Tensor, Tensor]: + """For each target, selects predictions from the same grid cell, where the center of the target box is. + + Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the + predictions within the grid cell. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + The indices of the matched predictions, background mask, and a mask for selecting the matched targets. + """ + anchor_xy, target_wh = target_boxes_to_grid(preds["boxes"], targets["boxes"], image_size) + target_selector, anchor_idx = self.match(target_wh) + anchor_x = anchor_xy[target_selector, 0] + anchor_y = anchor_xy[target_selector, 1] + + # Background mask is used to select anchors that are not responsible for predicting any object, for + # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a + # predicted box overlaps any target significantly, or if a prediction is matched to a target. + background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) + background_mask[anchor_y, anchor_x, anchor_idx] = False + + pred_selector = [anchor_y, anchor_x, anchor_idx] + return pred_selector, background_mask, target_selector + + +class SizeRatioMatching: + """For each target, select those prior shapes, whose width and height relative to the target is below given + ratio. + + This is the matching rule used by Ultralytics YOLOv5 implementation. + + Args: + prior_shapes: A list of all the prior box dimensions. The list should contain [width, height] pairs in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + threshold: Size ratio threshold for matching. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + """ + + def __init__( + self, + prior_shapes: PRIOR_SHAPES, + prior_shape_idxs: List[int], + threshold: float, + ignore_bg_threshold: float = 0.7, + ) -> None: + self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] + self.threshold = threshold + self.ignore_bg_threshold = ignore_bg_threshold + + def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: + """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + + Args: + wh: A matrix of predicted width and height values. + + Returns: + matched_targets, matched_anchors: Two vectors. The first vector is used to select the targets that this + layer matched and the second one lists the matching anchors within the grid cell. 
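+
+        Example (an illustrative sketch, not part of the original docstring; the prior shapes, threshold,
+        and target size below are made up for demonstration):
+
+            >>> import torch
+            >>> matching = SizeRatioMatching(
+            ...     prior_shapes=[[10, 10], [20, 20], [40, 40]], prior_shape_idxs=[0, 1, 2], threshold=4.0
+            ... )
+            >>> wh = torch.tensor([[30.0, 30.0]])  # one target width/height pair
+            >>> target_idxs, anchor_idxs = matching.match(wh)
+            >>> # Each prior whose width and height are within a factor of 4 of the target dimensions is matched.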
+ """ + prior_wh = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + below_threshold = (box_size_ratio(wh, prior_wh) < self.threshold).nonzero() + return below_threshold[:, 0], below_threshold[:, 1] + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[List[Tensor], Tensor, Tensor]: + """For each target, selects predictions from the same grid cell, where the center of the target box is. + + Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the + predictions within the grid cell. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + The indices of the matched predictions, background mask, and a mask for selecting the matched targets. + """ + anchor_xy, target_wh = target_boxes_to_grid(preds["boxes"], targets["boxes"], image_size) + target_selector, anchor_idx = self.match(target_wh) + anchor_x = anchor_xy[target_selector, 0] + anchor_y = anchor_xy[target_selector, 1] + + # Background mask is used to select anchors that are not responsible for predicting any object, for + # calculating the part of the confidence loss with zero as the target confidence. It is set to False, if a + # predicted box overlaps any target significantly, or if a prediction is matched to a target. + background_mask = iou_below(preds["boxes"], targets["boxes"], self.ignore_bg_threshold) + background_mask[anchor_y, anchor_x, anchor_idx] = False + + pred_selector = [anchor_y, anchor_x, anchor_idx] + return pred_selector, background_mask, target_selector + + +def _sim_ota_match(costs: Tensor, ious: Tensor) -> Tuple[Tensor, Tensor]: + """Implements the SimOTA matching rule. + + The number of units supplied by each supplier (training target) needs to be decided in the Optimal Transport + problem. "Dynamic k Estimation" uses the sum of the top 10 IoU values (casted to int) between the target and the + predicted boxes. + + Args: + costs: A ``[predictions, targets]`` matrix of losses. + ious: A ``[predictions, targets]`` matrix of IoUs. + + Returns: + A mask of predictions that were matched, and the indices of the matched targets. The latter contains as many + elements as there are ``True`` values in the mask. + """ + num_preds, num_targets = ious.shape + + matching_matrix = torch.zeros_like(costs, dtype=torch.bool) + + if ious.numel() > 0: + # For each target, define k as the sum of the 10 highest IoUs. + top10_iou = torch.topk(ious, min(10, num_preds), dim=0).values.sum(0) + ks = torch.clip(top10_iou.int(), min=1) + assert len(ks) == num_targets + + # For each target, select k predictions with the lowest cost. + for target_idx, (target_costs, k) in enumerate(zip(costs.T, ks)): + pred_idx = torch.topk(target_costs, k, largest=False).indices + matching_matrix[pred_idx, target_idx] = True + + # If there's more than one match for some prediction, match it with the best target. Now we consider all + # targets, regardless of whether they were originally matched with the prediction or not. + more_than_one_match = matching_matrix.sum(1) > 1 + best_targets = costs[more_than_one_match, :].argmin(1) + matching_matrix[more_than_one_match, :] = False + matching_matrix[more_than_one_match, best_targets] = True + + # For those predictions that were matched, get the index of the target. 
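+    # ``matching_matrix`` is boolean and, after the disambiguation step above, has at most one True value per
+    # row, so casting to int and taking the argmax over the target dimension recovers the matched target index.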
+ pred_mask = matching_matrix.sum(1) > 0 + target_selector = matching_matrix[pred_mask, :].int().argmax(1) + return pred_mask, target_selector + + +class SimOTAMatching: + """Selects which anchors are used to predict each target using the SimOTA matching rule. + + This is the matching rule used by YOLOX. + + Args: + prior_shapes: A list of all the prior box dimensions. The list should contain [width, height] pairs in the + network input resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + loss_func: A ``YOLOLoss`` object that can be used to calculate the pairwise costs. + spatial_range: For each target, restrict to the anchors that are within an `N x N` grid cell are centered at the + target, where `N` is the value of this parameter. + size_range: For each target, restrict to the anchors whose prior dimensions are not larger than the target + dimensions multiplied by this value and not smaller than the target dimensions divided by this value. + """ + + def __init__( + self, + prior_shapes: PRIOR_SHAPES, + prior_shape_idxs: List[int], + loss_func: YOLOLoss, + spatial_range: float, + size_range: float, + ) -> None: + self.prior_shapes = [prior_shapes[idx] for idx in prior_shape_idxs] + self.loss_func = loss_func + self.spatial_range = spatial_range + self.size_range = size_range + + def __call__( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + image_size: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor]: + """For each target, selects predictions using the SimOTA matching rule. + + Args: + preds: Predictions for a single image. + targets: Training targets for a single image. + image_size: Input image width and height. + + Returns: + A mask of predictions that were matched, background mask (inverse of the first mask), and the indices of the + matched targets. The last tensor contains as many elements as there are ``True`` values in the first mask. + """ + height, width, boxes_per_cell, _ = preds["boxes"].shape + prior_mask, anchor_inside_target = self._get_prior_mask(targets, image_size, width, height, boxes_per_cell) + prior_preds = { + "boxes": preds["boxes"][prior_mask], + "confidences": preds["confidences"][prior_mask], + "classprobs": preds["classprobs"][prior_mask], + } + + losses, ious = self.loss_func.pairwise(prior_preds, targets, input_is_normalized=False) + costs = losses.overlap + losses.confidence + losses.classification + costs += 100000.0 * ~anchor_inside_target + pred_mask, target_selector = _sim_ota_match(costs, ious) + + # Add the anchor dimension to the mask and replace True values with the results of the actual SimOTA matching. + pred_selector = prior_mask.nonzero().T.tolist() + prior_mask[pred_selector] = pred_mask + + background_mask = torch.logical_not(prior_mask) + + return prior_mask, background_mask, target_selector + + def _get_prior_mask( + self, + targets: Dict[str, Tensor], + image_size: Tensor, + grid_width: int, + grid_height: int, + boxes_per_cell: int, + ) -> Tuple[Tensor, Tensor]: + """Creates a mask for selecting the "center prior" anchors. + + In the first step we restrict ourselves to the grid cells whose center is inside or close enough to one or more + targets. + + Args: + targets: Training targets for a single image. + image_size: Input image width and height. + grid_width: Width of the feature grid. + grid_height: Height of the feature grid. + boxes_per_cell: Number of boxes that will be predicted per feature grid cell. 
+ + Returns: + Two masks, a ``[grid_height, grid_width, boxes_per_cell]`` mask for selecting anchors that are close and + similar in shape to a target, and an ``[anchors, targets]`` matrix that indicates which targets are inside + those anchors. + """ + # A multiplier for scaling feature map coordinates to image coordinates + grid_size = torch.tensor([grid_width, grid_height], device=targets["boxes"].device) + grid_to_image = torch.true_divide(image_size, grid_size) + + # Get target center coordinates and dimensions. + xywh = box_convert(targets["boxes"], in_fmt="xyxy", out_fmt="cxcywh") + xy = xywh[:, :2] + wh = xywh[:, 2:] + + # Create a [boxes_per_cell, targets] tensor for selecting prior shapes that are close enough to the target + # dimensions. + prior_wh = torch.tensor(self.prior_shapes, device=targets["boxes"].device) + shape_selector = box_size_ratio(prior_wh, wh) < self.size_range + + # Create a [grid_cells, targets] tensor for selecting spatial locations that are inside target bounding boxes. + centers = grid_centers(grid_size).view(-1, 2) * grid_to_image + inside_selector = is_inside_box(centers, targets["boxes"]) + + # Combine the above selectors into a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are + # inside target bounding boxes and close enough shape. + inside_selector = inside_selector[:, None, :].repeat(1, boxes_per_cell, 1) + inside_selector = torch.logical_and(inside_selector, shape_selector) + + # Set the width and height of all target bounding boxes to self.range grid cells and create a selector for + # anchors that are now inside the boxes. If a small target has no anchors inside its bounding box, it will be + # matched to one of these anchors, but a high penalty will ensure that anchors that are inside the bounding box + # will be preferred. + wh = self.spatial_range * grid_to_image * torch.ones_like(xy) + xywh = torch.cat((xy, wh), -1) + boxes = box_convert(xywh, in_fmt="cxcywh", out_fmt="xyxy") + close_selector = is_inside_box(centers, boxes) + + # Create a [grid_cells, boxes_per_cell, targets] tensor for selecting anchors that are spatially close to a + # target and whose shape is close enough to the target. + close_selector = close_selector[:, None, :].repeat(1, boxes_per_cell, 1) + close_selector = torch.logical_and(close_selector, shape_selector) + + mask = torch.logical_or(inside_selector, close_selector).sum(-1) > 0 + mask = mask.view(grid_height, grid_width, boxes_per_cell) + inside_selector = inside_selector.view(grid_height, grid_width, boxes_per_cell, -1) + return mask, inside_selector[mask] diff --git a/mart/models/detection/yolo.py b/mart/models/detection/yolo.py new file mode 100644 index 00000000..d8f23732 --- /dev/null +++ b/mart/models/detection/yolo.py @@ -0,0 +1,438 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/yolo.py +import warnings +from typing import Any, Dict, List, Optional, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from ...ops import batched_nms +from ...transforms import functional as F +from .._api import register_model, Weights, WeightsEnum +from .._utils import _ovewrite_value_param +from ..yolo import YOLOV4Backbone +from .backbone_utils import _validate_trainable_layers +from .yolo_networks import DarknetNetwork, PRED, TARGET, TARGETS, YOLOV4Network + +IMAGES = List[Tensor] # TorchScript doesn't allow a tuple. 
+ + +class YOLO(nn.Module): + """YOLO implementation that supports the most important features of YOLOv3, YOLOv4, YOLOv5, YOLOv7, Scaled- + YOLOv4, and YOLOX. + + *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `__ + + *YOLOv4 paper*: `Alexey Bochkovskiy, Chien-Yao Wang, and Hong-Yuan Mark Liao `__ + + *YOLOv7 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao `__ + + *Scaled-YOLOv4 paper*: `Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao + `__ + + *YOLOX paper*: `Zheng Ge, Songtao Liu, Feng Wang, Zeming Li, and Jian Sun `__ + + The network architecture can be written in PyTorch, or read from a Darknet configuration file using the + :class:`~.yolo_networks.DarknetNetwork` class. ``DarknetNetwork`` is also able to read weights that have been saved + by Darknet. + + The input is expected to be a list of images. Each image is a tensor with shape ``[channels, height, width]``. The + images from a single batch will be stacked into a single tensor, so the sizes have to match. Different batches can + have different image sizes, as long as the size is divisible by the ratio in which the network downsamples the + input. + + During training, the model expects both the image tensors and a list of targets. It's possible to train a model + using one integer class label per box, but the YOLO model supports also multiple labels per box. For multi-label + training, simply use a boolean matrix that indicates which classes are assigned to which boxes, in place of the + class labels. *Each target is a dictionary containing the following tensors*: + + - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in `(x1, y1, x2, y2)` format + - labels (``Int64Tensor[N]`` or ``BoolTensor[N, classes]``): the class label or a boolean class mask for each + ground-truth box + + :func:`~.yolo.YOLO.forward` method returns all predictions from all detection layers in one tensor with shape + ``[N, anchors, classes + 5]``, where ``anchors`` is the total number of anchors in all detection layers. The + coordinates are scaled to the input image size. During training it also returns a dictionary containing the + classification, box overlap, and confidence losses. + + During inference, the model requires only the image tensor. :func:`~.yolo.YOLO.infer` method filters and + processes the predictions. If a prediction has a high score for more than one class, it will be duplicated. 
*The + processed output is returned in a dictionary containing the following tensors*: + + - boxes (``FloatTensor[N, 4]``): predicted bounding box `(x1, y1, x2, y2)` coordinates in image space + - scores (``FloatTensor[N]``): detection confidences + - labels (``Int64Tensor[N]``): the predicted labels for each object + + Detection using a Darknet configuration and pretrained weights: + + >>> from urllib.request import urlretrieve + >>> import torch + >>> from torchvision.models.detection import DarknetNetwork, YOLO + >>> + >>> urlretrieve("https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg", "yolov4-tiny-3l.cfg") + >>> urlretrieve("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.conv.29", "yolov4-tiny.conv.29") + >>> network = DarknetNetwork("yolov4-tiny-3l.cfg", "yolov4-tiny.conv.29") + >>> model = YOLO(network) + >>> image = torch.rand(3, 608, 608) + >>> detections = model.infer(image) + + Detection using a predefined YOLOv4 network: + + >>> import torch + >>> from torchvision.models.detection import YOLOV4Network, YOLO + >>> + >>> network = YOLOV4Network(num_classes=91) + >>> model = YOLO(network) + >>> image = torch.rand(3, 608, 608) + >>> detections = model.infer(image) + + Args: + network: A module that represents the network layers. This can be obtained from a Darknet configuration using + :func:`~.yolo_networks.DarknetNetwork`, or it can be defined as PyTorch code. + confidence_threshold: Postprocessing will remove bounding boxes whose confidence score is not higher than this + threshold. + nms_threshold: Non-maximum suppression will remove bounding boxes whose IoU with a higher confidence box is + higher than this threshold, if the predicted categories are equal. + detections_per_image: Keep at most this number of highest-confidence detections per image. + """ + + def __init__( + self, + network: nn.Module, + confidence_threshold: float = 0.2, + nms_threshold: float = 0.45, + detections_per_image: int = 300, + ) -> None: + super().__init__() + + self.network = network + self.confidence_threshold = confidence_threshold + self.nms_threshold = nms_threshold + self.detections_per_image = detections_per_image + + def forward( + self, images: Union[Tensor, IMAGES], targets: Optional[TARGETS] = None + ) -> Union[Tensor, Dict[str, Tensor]]: + """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets + are provided, computes the losses from the detection layers. + + Detections are concatenated from the detection layers. Each detection layer will produce a number of detections + that depends on the size of the feature map and the number of anchors per feature map cell. + + Args: + images: A tensor of size ``[batch_size, channels, height, width]`` containing a batch of images or a list of + image tensors. + targets: Compute losses against these targets. A list of dictionaries, one for each image. Must be given in + training mode. + + Returns: + If targets are given, returns a dictionary containing the three losses (overlap, confidence, and + classification). Otherwise returns detections in a tensor shaped ``[batch_size, anchors, classes + 5]``, + where ``anchors`` is the total number of anchors in all detection layers. The number of anchors in a + detection layer is the feature map size (width * height) times the number of anchors per cell (usually 3 or + 4). The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. 
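+
+        Example (an illustrative sketch, not part of the original docstring; the network, image size, and
+        target below are placeholders):
+
+            >>> import torch
+            >>> network = YOLOV4Network(num_classes=91)
+            >>> model = YOLO(network)
+            >>> images = [torch.rand(3, 608, 608)]
+            >>> targets = [{"boxes": torch.tensor([[10.0, 20.0, 200.0, 300.0]]), "labels": torch.tensor([5])}]
+            >>> losses = model(images, targets)  # with targets: a dictionary of losses
+            >>> model.eval()
+            >>> detections = model(images)  # without targets: a [1, anchors, 96] detection tensor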
+ """ + self.validate_batch(images, targets) + images_tensor = images if isinstance(images, Tensor) else torch.stack(images) + detections, losses, hits = self.network(images_tensor, targets) + + if targets is None: + detections = torch.cat(detections, 1) + return detections + + losses = torch.stack(losses).sum(0) + return {"overlap": losses[0], "confidence": losses[1], "classification": losses[2]} + + def infer(self, image: Tensor) -> PRED: + """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class + labels. + + If a prediction has a high score for more than one class, it will be duplicated. + + Args: + image: An input image, a tensor of uint8 values sized ``[channels, height, width]``. + + Returns: + A dictionary containing tensors "boxes", "scores", and "labels". "boxes" is a matrix of detected bounding + box `(x1, y1, x2, y2)` coordinates. "scores" is a vector of confidence scores for the bounding box + detections. "labels" is a vector of predicted class labels. + """ + if not isinstance(image, Tensor): + image = F.to_tensor(image) + + was_training = self.training + self.eval() + + detections = self([image]) + detections = self.process_detections(detections) + detections = detections[0] + + if was_training: + self.train() + return detections + + def process_detections(self, preds: Tensor) -> List[PRED]: + """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and + filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of + predictions. + + If for any single detection there are multiple categories whose score is above the confidence threshold, the + detection will be duplicated to create one detection for each category. NMS processes one category at a time, + iterating over the bounding boxes in descending order of confidence score, and removes lower scoring boxes that + have an IoU greater than the NMS threshold with a higher scoring box. + + The returned detections are sorted by descending confidence. The items of the dictionaries are as follows: + - boxes (``Tensor[batch_size, N, 4]``): detected bounding box `(x1, y1, x2, y2)` coordinates + - scores (``Tensor[batch_size, N]``): detection confidences + - labels (``Int64Tensor[batch_size, N]``): the predicted class IDs + + Args: + preds: A tensor of detected bounding boxes and their attributes. + + Returns: + Filtered detections. A list of prediction dictionaries, one for each image. + """ + + def process(boxes: Tensor, confidences: Tensor, classprobs: Tensor) -> Dict[str, Any]: + scores = classprobs * confidences[:, None] + + # Select predictions with high scores. If a prediction has a high score for more than one class, it will be + # duplicated. + idxs, labels = (scores > self.confidence_threshold).nonzero().T + boxes = boxes[idxs] + scores = scores[idxs, labels] + + keep = batched_nms(boxes, scores, labels, self.nms_threshold) + keep = keep[: self.detections_per_image] + return {"boxes": boxes[keep], "scores": scores[keep], "labels": labels[keep]} + + return [process(p[..., :4], p[..., 4], p[..., 5:]) for p in preds] + + def process_targets(self, targets: TARGETS) -> List[TARGET]: + """Duplicates multi-label targets to create one target for each label. + + Args: + targets: List of target dictionaries. Each dictionary must contain "boxes" and "labels". "labels" is either + a one-dimensional list of class IDs, or a two-dimensional boolean class map. + + Returns: + Single-label targets. 
A list of target dictionaries, one for each image.
+        """
+
+        def process(boxes: Tensor, labels: Tensor, **other: Any) -> Dict[str, Any]:
+            if labels.ndim == 2:
+                idxs, labels = labels.nonzero().T
+                boxes = boxes[idxs]
+            return {"boxes": boxes, "labels": labels, **other}
+
+        return [process(**t) for t in targets]
+
+    def validate_batch(self, images: Union[Tensor, IMAGES], targets: Optional[TARGETS]) -> None:
+        """Validates the format of a batch of data.
+
+        Args:
+            images: A tensor containing a batch of images or a list of image tensors.
+            targets: A list of target dictionaries or ``None``. If a list is provided, there should be as many target
+                dictionaries as there are images.
+        """
+        if not isinstance(images, Tensor):
+            if not isinstance(images, (tuple, list)):
+                raise TypeError(f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}.")
+            if not images:
+                raise ValueError("No images in batch.")
+            shape = images[0].shape
+            for image in images:
+                if not isinstance(image, Tensor):
+                    raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.")
+                if image.shape != shape:
+                    raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}")
+
+        if targets is None:
+            if self.training:
+                raise ValueError("Targets should be given in training mode.")
+            else:
+                return
+
+        if not isinstance(targets, (tuple, list)):
+            raise TypeError(f"Expected targets to be a tuple or a list, got {type(targets).__name__}.")
+        if len(images) != len(targets):
+            raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.")
+
+        for target in targets:
+            if "boxes" not in target:
+                raise ValueError("Target dictionary doesn't contain boxes.")
+            boxes = target["boxes"]
+            if not isinstance(boxes, Tensor):
+                raise TypeError(f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}.")
+            if (boxes.ndim != 2) or (boxes.shape[-1] != 4):
+                raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.")
+            if "labels" not in target:
+                raise ValueError("Target dictionary doesn't contain labels.")
+            labels = target["labels"]
+            if not isinstance(labels, Tensor):
+                raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels).__name__}.")
+            if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)):
+                raise ValueError(
+                    f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}."
+                )
+
+
+class YOLOV4_Backbone_Weights(WeightsEnum):
+    # TODO: Create pretrained weights.
+    DEFAULT = Weights(
+        url="",
+        transforms=lambda x: x,
+        meta={},
+    )
+
+
+class YOLOV4_Weights(WeightsEnum):
+    # TODO: Create pretrained weights.
+    DEFAULT = Weights(
+        url="",
+        transforms=lambda x: x,
+        meta={},
+    )
+
+
+def freeze_backbone_layers(backbone: nn.Module, trainable_layers: Optional[int], is_trained: bool) -> None:
+    """Freezes backbone layers that won't be used for training.
+
+    Args:
+        backbone: The backbone network.
+        trainable_layers: Number of trainable layers (stages), starting from the final stage.
+        is_trained: Set to ``True`` when using pre-trained weights. Otherwise will issue a warning if
+            ``trainable_layers`` is set.
+    """
+    if not hasattr(backbone, "stages"):
+        warnings.warn("Cannot freeze backbone layers. 
Backbone object has no 'stages' attribute.")
+        return
+
+    num_layers = len(backbone.stages)  # type: ignore
+    trainable_layers = _validate_trainable_layers(is_trained, trainable_layers, num_layers, 3)
+
+    layers_to_train = [f"stages.{idx}" for idx in range(num_layers - trainable_layers, num_layers)]
+    if trainable_layers == num_layers:
+        layers_to_train.append("stem")
+
+    for name, parameter in backbone.named_parameters():
+        if all([not name.startswith(layer) for layer in layers_to_train]):
+            parameter.requires_grad_(False)
+
+
+@register_model()
+def yolov4(
+    weights: Optional[YOLOV4_Weights] = None,
+    progress: bool = True,
+    in_channels: int = 3,
+    num_classes: Optional[int] = None,
+    weights_backbone: Optional[YOLOV4_Backbone_Weights] = None,
+    trainable_backbone_layers: Optional[int] = None,
+    confidence_threshold: float = 0.2,
+    nms_threshold: float = 0.45,
+    detections_per_image: int = 300,
+    **kwargs: Any,
+) -> YOLO:
+    """
+    Constructs a YOLOv4 model.
+
+    .. betastatus:: detection module
+
+    Example:
+
+        >>> import torch
+        >>> from torchvision.models.detection import yolov4, YOLOV4_Weights
+        >>>
+        >>> model = yolov4(weights=YOLOV4_Weights.DEFAULT)
+        >>> image = torch.rand(3, 608, 608)
+        >>> detections = model.infer(image)
+
+    Args:
+        weights: Pretrained weights to use. See :class:`~.YOLOV4_Weights` below for more details and possible values. By
+            default, the model will be initialized randomly.
+        progress: If ``True``, displays a progress bar of the download to ``stderr``.
+        in_channels: Number of channels in the input image.
+        num_classes: Number of output classes of the model (including the background). By default, this value is set to
+            91 or read from the weights.
+        weights_backbone: Pretrained weights for the backbone. See :class:`~.YOLOV4_Backbone_Weights` below for more
+            details and possible values. By default, the backbone will be initialized randomly.
+        trainable_backbone_layers: Number of trainable (not frozen) layers (stages), starting from the final stage.
+            Valid values are between 0 and the number of stages in the backbone. By default, this value is set to 3.
+        confidence_threshold: Postprocessing will remove bounding boxes whose confidence score is not higher than this
+            threshold.
+        nms_threshold: Non-maximum suppression will remove bounding boxes whose IoU with a higher confidence box is
+            higher than this threshold, if the predicted categories are equal.
+        detections_per_image: Keep at most this number of highest-confidence detections per image.
+        **kwargs: Parameters passed to the ``torchvision.models.detection.YOLOV4Network`` class. Please refer to the
+            `source code `_
+            for more details about this class.
+
+    .. autoclass:: .YOLOV4_Weights
+        :members:
+
+    .. 
autoclass:: .YOLOV4_Backbone_Weights + :members: + """ + weights = YOLOV4_Weights.verify(weights) + weights_backbone = YOLOV4_Backbone_Weights.verify(weights_backbone) + + if weights is not None: + weights_backbone = None + num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"])) + elif num_classes is None: + num_classes = 91 + + backbone_kwargs = {key: kwargs[key] for key in ("widths", "activation", "normalization") if key in kwargs} + backbone = YOLOV4Backbone(in_channels, **backbone_kwargs) + + is_trained = weights is not None or weights_backbone is not None + freeze_backbone_layers(backbone, trainable_backbone_layers, is_trained) + + if weights_backbone is not None: + backbone.load_state_dict(weights_backbone.get_state_dict(progress=progress)) + + network = YOLOV4Network(num_classes, backbone, **kwargs) + model = YOLO(network, confidence_threshold, nms_threshold, detections_per_image) + + if weights is not None: + model.load_state_dict(weights.get_state_dict(progress=progress)) + + return model + + +def yolo_darknet( + config_path: str, + weights_path: Optional[str] = None, + confidence_threshold: float = 0.2, + nms_threshold: float = 0.45, + detections_per_image: int = 300, + **kwargs: Any, +) -> YOLO: + """ + Constructs a YOLO model from a Darknet configuration file. + + .. betastatus:: detection module + + Example: + + >>> from urllib.request import urlretrieve + >>> from torchvision.models.detection import yolo_darknet + >>> + >>> urlretrieve("https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny-3l.cfg", "yolov4-tiny-3l.cfg") + >>> urlretrieve("https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.conv.29", "yolov4-tiny.conv.29") + >>> model = yolo_darknet("yolov4-tiny-3l.cfg", "yolov4-tiny.conv.29") + >>> image = torch.rand(3, 608, 608) + >>> detections = model.infer(image) + + Args: + config_path: Path to a Darknet configuration file that defines the network architecture. + weights_path: Path to a Darknet weights file to load. + confidence_threshold: Postprocessing will remove bounding boxes whose confidence score is not higher than this + threshold. + nms_threshold: Non-maximum suppression will remove bounding boxes whose IoU with a higher confidence box is + higher than this threshold, if the predicted categories are equal. + detections_per_image: Keep at most this number of highest-confidence detections per image. + **kwargs: Parameters passed to the ``torchvision.models.detection.DarknetNetwork`` class. Please refer to the + `source code `_ + for more details about this class. 
+ """ + network = DarknetNetwork(config_path, weights_path, **kwargs) + return YOLO(network, confidence_threshold, nms_threshold, detections_per_image) diff --git a/mart/models/detection/yolo_loss.py b/mart/models/detection/yolo_loss.py new file mode 100644 index 00000000..e6bd69da --- /dev/null +++ b/mart/models/detection/yolo_loss.py @@ -0,0 +1,363 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/yolo_loss.py +from dataclasses import dataclass +from typing import Dict, Optional, Tuple + +import torch +from torch import Tensor +from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits, one_hot + +from torchvision.ops import ( + box_iou, + complete_box_iou, + complete_box_iou_loss, + distance_box_iou, + distance_box_iou_loss, + generalized_box_iou, + generalized_box_iou_loss, +) + + +def _binary_cross_entropy( + inputs: Tensor, targets: Tensor, reduction: str = "mean", input_is_normalized: bool = True +) -> Tensor: + """Returns the binary cross entropy from either normalized inputs or logits. + + It would be more convenient to pass the correct cross entropy function to every function that uses it, but + TorchScript doesn't allow passing functions. + + Args: + inputs: Probabilities in a tensor of an arbitrary shape. + targets: Targets in a tensor of the same shape as ``input``. + reduction: Specifies the reduction to apply to the output. ``'none'``: no reduction will be applied, ``'mean'``: + the sum of the output will be divided by the number of elements in the output, ``'sum'``: the output will be + summed. + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. + """ + if input_is_normalized: + return binary_cross_entropy(inputs, targets, reduction=reduction) + else: + return binary_cross_entropy_with_logits(inputs, targets, reduction=reduction) + + +def box_iou_loss(boxes1: Tensor, boxes2: Tensor) -> Tensor: + return 1.0 - box_iou(boxes1, boxes2).diagonal() + + +def _size_compensation(targets: Tensor, image_size: Tensor) -> Tensor: + """Calcuates the size compensation factor for the overlap loss. + + The overlap losses for each target should be multiplied by the returned weight. The returned value is + `2 - (unit_width * unit_height)`, which is large for small boxes (the maximum value is 2) and small for large boxes + (the minimum value is 1). + + Args: + targets: An ``[N, 4]`` matrix of target `(x1, y1, x2, y2)` coordinates. + image_size: Image size, which is used to scale the target boxes to unit coordinates. + + Returns: + The size compensation factor. + """ + unit_wh = targets[:, 2:] / image_size + return 2 - (unit_wh[:, 0] * unit_wh[:, 1]) + + +def _pairwise_confidence_loss( + preds: Tensor, overlap: Tensor, input_is_normalized: bool, predict_overlap: Optional[float] +) -> Tensor: + """Calculates the confidence loss for every pair of a foreground anchor and a target. + + If ``predict_overlap`` is ``None``, the target confidence will be 1. If ``predict_overlap`` is 1.0, ``overlap`` will + be used as the target confidence. Otherwise this parameter defines a balance between these two targets. The method + returns a vector of losses for each foreground anchor. + + Args: + preds: An ``[N]`` vector of predicted confidences. + overlap: An ``[N, M]`` matrix of overlaps between all predicted and target bounding boxes. + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. 
+ predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the overlap. + + Returns: + An ``[N, M]`` matrix of confidence losses between all predictions and targets. + """ + if predict_overlap is not None: + # When predicting overlap, target confidence is different for each pair of a prediction and a target. The + # tensors have to be broadcasted to [N, M]. + preds = preds.unsqueeze(1).expand(overlap.shape) + targets = torch.ones_like(preds) - predict_overlap + # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. + targets += predict_overlap * overlap.detach().clamp(min=0) + return _binary_cross_entropy(preds, targets, reduction="none", input_is_normalized=input_is_normalized) + else: + # When not predicting overlap, target confidence is the same for every prediction, but we should still return a + # matrix. + targets = torch.ones_like(preds) + result = _binary_cross_entropy(preds, targets, reduction="none", input_is_normalized=input_is_normalized) + return result.unsqueeze(1).expand(overlap.shape) + + +def _foreground_confidence_loss( + preds: Tensor, overlap: Tensor, input_is_normalized: bool, predict_overlap: Optional[float] +) -> Tensor: + """Calculates the sum of the confidence losses for foreground anchors and their matched targets. + + If ``predict_overlap`` is ``None``, the target confidence will be 1. If ``predict_overlap`` is 1.0, ``overlap`` will + be used as the target confidence. Otherwise this parameter defines a balance between these two targets. The method + returns a vector of losses for each foreground anchor. + + Args: + preds: A vector of predicted confidences. + overlap: A vector of overlaps between matched target and predicted bounding boxes. + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1, and 1.0 means that the target confidence is the overlap. + + Returns: + The sum of the confidence losses for foreground anchors. + """ + targets = torch.ones_like(preds) + if predict_overlap is not None: + targets -= predict_overlap + # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. + targets += predict_overlap * overlap.detach().clamp(min=0) + return _binary_cross_entropy(preds, targets, reduction="sum", input_is_normalized=input_is_normalized) + + +def _background_confidence_loss(preds: Tensor, input_is_normalized: bool) -> Tensor: + """Calculates the sum of the confidence losses for background anchors. + + Args: + preds: A vector of predicted confidences for background anchors. + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. + + Returns: + The sum of the background confidence losses. + """ + targets = torch.zeros_like(preds) + return _binary_cross_entropy(preds, targets, reduction="sum", input_is_normalized=input_is_normalized) + + +def _target_labels_to_probs( + targets: Tensor, num_classes: int, dtype: torch.dtype, label_smoothing: Optional[float] = None +) -> Tensor: + """If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. + + If label smoothing is disabled, the returned target probabilities will be binary. 
If label smoothing is enabled, the
+    target probabilities will be ``(label_smoothing / 2)`` or ``(label_smoothing / 2) + (1.0 - label_smoothing)``. That
+    corresponds to label smoothing with two categories, since the YOLO model does multi-label classification.
+
+    Args:
+        targets: An ``[M, C]`` matrix of target class probabilities or an ``[M]`` vector of class labels.
+        num_classes: The number of classes (C dimension) for the new targets. If ``targets`` is already two-dimensional,
+            checks that the length of the second dimension matches this number.
+        dtype: Floating-point data type to be used for the one-hot targets.
+        label_smoothing: The epsilon parameter (weight) for label smoothing. 0.0 means no smoothing (binary targets),
+            and 1.0 means that the target probabilities are always 0.5.
+
+    Returns:
+        An ``[M, C]`` matrix of target class probabilities.
+    """
+    if targets.ndim == 1:
+        # The data may contain a different number of classes than what the model predicts. In case a label is
+        # greater than the number of predicted classes, it will be mapped to the last class.
+        last_class = torch.tensor(num_classes - 1, device=targets.device)
+        targets = torch.min(targets, last_class)
+        targets = one_hot(targets, num_classes)
+    elif targets.shape[-1] != num_classes:
+        raise ValueError(
+            f"The number of classes in the data ({targets.shape[-1]}) doesn't match the number of classes "
+            f"predicted by the model ({num_classes})."
+        )
+    targets = targets.to(dtype=dtype)
+    if label_smoothing is not None:
+        targets = (label_smoothing / 2) + targets * (1.0 - label_smoothing)
+    return targets
+
+
+@torch.jit.script
+@dataclass
+class Losses:
+    overlap: Tensor
+    confidence: Tensor
+    classification: Tensor
+
+
+class YOLOLoss:
+    """A class for calculating the YOLO losses from predictions and targets.
+
+    If label smoothing is enabled, the target class probabilities will be ``(label_smoothing / 2)`` or
+    ``(label_smoothing / 2) + (1.0 - label_smoothing)``, instead of 0 or 1. That corresponds to label smoothing with two
+    categories, since the YOLO model does multi-label classification.
+
+    Args:
+        overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou",
+            "giou", "diou", and "ciou".
+        predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target
+            confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of
+            ``overlap_func``.
+        label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary
+            targets), and 1.0 means that the target probabilities are always 0.5.
+        overlap_multiplier: Overlap loss will be scaled by this value.
+        confidence_multiplier: Confidence loss will be scaled by this value.
+        class_multiplier: Classification loss will be scaled by this value.
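+
+    Example (an illustrative sketch, not part of the original docstring):
+
+        >>> loss_func = YOLOLoss(overlap_func="giou", predict_overlap=1.0, overlap_multiplier=5.0)
+        >>> # ``loss_func.pairwise(...)`` produces [N, M] cost matrices for SimOTA matching, while
+        >>> # ``loss_func.elementwise_sums(...)`` sums the losses over already matched prediction/target pairs.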
+ """ + + def __init__( + self, + overlap_func: str = "ciou", + predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, + overlap_multiplier: float = 5.0, + confidence_multiplier: float = 1.0, + class_multiplier: float = 1.0, + ): + self.overlap_func = overlap_func + self.predict_overlap = predict_overlap + self.label_smoothing = label_smoothing + self.overlap_multiplier = overlap_multiplier + self.confidence_multiplier = confidence_multiplier + self.class_multiplier = class_multiplier + + def pairwise( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + input_is_normalized: bool, + ) -> Tuple[Losses, Tensor]: + """Calculates matrices containing the losses for all prediction/target pairs. + + This method is called for obtaining costs for SimOTA matching. + + Args: + preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". Each tensor + contains `N` rows. + targets: A dictionary of training targets, containing "boxes" and "labels". Each tensor contains `M` rows. + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. + + Returns: + Loss matrices and an overlap matrix. Each matrix is shaped ``[N, M]``. + """ + loss_shape = torch.Size([len(preds["boxes"]), len(targets["boxes"])]) + + overlap = self._pairwise_overlap(preds["boxes"], targets["boxes"]) + assert overlap.shape == loss_shape + + overlap_loss = 1.0 - overlap + assert overlap_loss.shape == loss_shape + + confidence_loss = _pairwise_confidence_loss( + preds["confidences"], overlap, input_is_normalized, self.predict_overlap + ) + assert confidence_loss.shape == loss_shape + + pred_probs = preds["classprobs"].unsqueeze(1) # [N, 1, classes] + target_probs = _target_labels_to_probs( + targets["labels"], pred_probs.shape[-1], pred_probs.dtype, self.label_smoothing + ) + target_probs = target_probs.unsqueeze(0) # [1, M, classes] + pred_probs, target_probs = torch.broadcast_tensors(pred_probs, target_probs) + class_loss = _binary_cross_entropy( + pred_probs, target_probs, reduction="none", input_is_normalized=input_is_normalized + ) + class_loss = class_loss.sum(-1) + assert class_loss.shape == loss_shape + + losses = Losses( + overlap_loss * self.overlap_multiplier, + confidence_loss * self.confidence_multiplier, + class_loss * self.class_multiplier, + ) + + return losses, overlap + + def elementwise_sums( + self, + preds: Dict[str, Tensor], + targets: Dict[str, Tensor], + input_is_normalized: bool, + image_size: Tensor, + ) -> Losses: + """Calculates the sums of the losses for optimization, over prediction/target pairs, assuming the + predictions and targets have been matched (there are as many predictions and targets). + + Args: + preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". + targets: A dictionary of training targets, containing "boxes" and "labels". + input_is_normalized: If ``False``, input is logits, if ``True``, input is normalized to `0..1`. + image_size: Width and height in a vector that defines the scale of the target coordinates. + + Returns: + The final losses. 
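+
+        Example (an illustrative sketch, not part of the original docstring; the tensors below are made up
+        and assume 91 classes):
+
+            >>> import torch
+            >>> loss_func = YOLOLoss()
+            >>> preds = {
+            ...     "boxes": torch.tensor([[10.0, 10.0, 50.0, 60.0]]),
+            ...     "confidences": torch.tensor([0.3]),
+            ...     "bg_confidences": torch.tensor([0.1, 0.2]),
+            ...     "classprobs": torch.full((1, 91), 0.5),
+            ... }
+            >>> targets = {"boxes": torch.tensor([[12.0, 8.0, 48.0, 62.0]]), "labels": torch.tensor([7])}
+            >>> image_size = torch.tensor([608.0, 608.0])
+            >>> losses = loss_func.elementwise_sums(preds, targets, input_is_normalized=True, image_size=image_size)
+            >>> total = losses.overlap + losses.confidence + losses.classification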
+ """ + overlap_loss = self._elementwise_overlap_loss(targets["boxes"], preds["boxes"]) + overlap = 1.0 - overlap_loss + overlap_loss = (overlap_loss * _size_compensation(targets["boxes"], image_size)).sum() + + confidence_loss = _foreground_confidence_loss( + preds["confidences"], overlap, input_is_normalized, self.predict_overlap + ) + confidence_loss += _background_confidence_loss(preds["bg_confidences"], input_is_normalized) + + pred_probs = preds["classprobs"] + target_probs = _target_labels_to_probs( + targets["labels"], pred_probs.shape[-1], pred_probs.dtype, self.label_smoothing + ) + class_loss = _binary_cross_entropy( + pred_probs, target_probs, reduction="sum", input_is_normalized=input_is_normalized + ) + + losses = Losses( + overlap_loss * self.overlap_multiplier, + confidence_loss * self.confidence_multiplier, + class_loss * self.class_multiplier, + ) + + return losses + + def _pairwise_overlap(self, boxes1: Tensor, boxes2: Tensor) -> Tensor: + """Returns the pairwise intersection-over-union values between two sets of boxes. + + Uses the IoU function specified in ``self.overlap_func``. It would be better to save the function in a variable, + but TorchScript doesn't allow this. + + Args: + boxes1: first set of boxes + boxes2: second set of boxes + + Returns: + A matrix containing the pairwise IoU values for every element in ``boxes1`` and ``boxes2``. + """ + if self.overlap_func == "iou": + return box_iou(boxes1, boxes2) + elif self.overlap_func == "giou": + return generalized_box_iou(boxes1, boxes2) + elif self.overlap_func == "diou": + return distance_box_iou(boxes1, boxes2) + elif self.overlap_func == "ciou": + return complete_box_iou(boxes1, boxes2) + else: + raise ValueError(f"Unknown IoU function '{self.overlap_func}'.") + + def _elementwise_overlap_loss(self, boxes1: Tensor, boxes2: Tensor) -> Tensor: + """Returns the elementwise intersection-over-union losses between two sets of boxes. + + Uses the IoU loss function specified in ``self.overlap_func``. It would be better to save the function in a + variable, but TorchScript doesn't allow this. + + Args: + boxes1: first set of boxes + boxes2: second set of boxes + + Returns: + A vector containing the IoU losses between corresponding elements in ``boxes1`` and ``boxes2``. 
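+
+        Example (an illustrative sketch, not part of the original docstring):
+
+            >>> import torch
+            >>> loss_func = YOLOLoss(overlap_func="iou")
+            >>> boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
+            >>> loss_func._elementwise_overlap_loss(boxes, boxes)  # identical boxes give zero loss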
+ """ + if self.overlap_func == "iou": + return box_iou_loss(boxes1, boxes2) + elif self.overlap_func == "giou": + return generalized_box_iou_loss(boxes1, boxes2) + elif self.overlap_func == "diou": + return distance_box_iou_loss(boxes1, boxes2) + elif self.overlap_func == "ciou": + return complete_box_iou_loss(boxes1, boxes2) + else: + raise ValueError(f"Unknown IoU function '{self.overlap_func}'.") diff --git a/mart/models/detection/yolo_networks.py b/mart/models/detection/yolo_networks.py new file mode 100644 index 00000000..3ada7ae1 --- /dev/null +++ b/mart/models/detection/yolo_networks.py @@ -0,0 +1,2026 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/yolo_networks.py +import io +import re +from collections import OrderedDict +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from warnings import warn + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from ...ops import box_convert +from ..yolo import ( + Conv, + CSPSPP, + CSPStage, + ELANStage, + FastSPP, + MaxPool, + RouteLayer, + ShortcutLayer, + YOLOV4Backbone, + YOLOV4TinyBackbone, + YOLOV5Backbone, + YOLOV7Backbone, +) +from .anchor_utils import global_xy +from .target_matching import HighestIoUMatching, IoUThresholdMatching, PRIOR_SHAPES, SimOTAMatching, SizeRatioMatching +from .yolo_loss import YOLOLoss + +DARKNET_CONFIG = Dict[str, Any] +CREATE_LAYER_OUTPUT = Tuple[nn.Module, int] # layer, num_outputs +PRED = Dict[str, Tensor] +PREDS = List[PRED] # TorchScript doesn't allow a tuple +TARGET = Dict[str, Tensor] +TARGETS = List[TARGET] # TorchScript doesn't allow a tuple +NETWORK_OUTPUT = Tuple[List[Tensor], List[Tensor], List[int]] # detections, losses, hits + + +class DetectionLayer(nn.Module): + """A YOLO detection layer. + + A YOLO model has usually 1 - 3 detection layers at different resolutions. The loss is summed from all of them. + + Args: + num_classes: Number of different classes that this layer predicts. + prior_shapes: A list of prior box dimensions for this layer, used for scaling the predicted dimensions. The list + should contain [width, height] pairs in the network input resolution. + matching_func: The matching algorithm to be used for assigning targets to anchors. + loss_func: ``YOLOLoss`` object for calculating the losses. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the + detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and + height are scaled up so that the maximum value is four times the anchor dimension. This is used by the + Darknet configurations of Scaled-YOLOv4. + """ + + def __init__( + self, + num_classes: int, + prior_shapes: PRIOR_SHAPES, + matching_func: Callable, + loss_func: YOLOLoss, + xy_scale: float = 1.0, + input_is_normalized: bool = False, + ) -> None: + super().__init__() + + self.num_classes = num_classes + self.prior_shapes = prior_shapes + self.matching_func = matching_func + self.loss_func = loss_func + self.xy_scale = xy_scale + self.input_is_normalized = input_is_normalized + + def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, PREDS]: + """Runs a forward pass through this YOLO detection layer. 
+ + Maps cell-local coordinates to global coordinates in the image space, scales the bounding boxes with the + anchors, converts the center coordinates to corner coordinates, and maps probabilities to the `]0, 1[` range + using sigmoid. + + If targets are given, computes also losses from the predictions and the targets. This layer is responsible only + for the targets that best match one of the anchors assigned to this layer. Training losses will be saved to the + ``losses`` attribute. ``hits`` attribute will be set to the number of targets that this layer was responsible + for. ``losses`` is a tensor of three elements: the overlap, confidence, and classification loss. + + Args: + x: The output from the previous layer. The size of this tensor has to be + ``[batch_size, anchors_per_cell * (num_classes + 5), height, width]``. + image_size: Image width and height in a vector (defines the scale of the predicted and target coordinates). + + Returns: + The layer output, with normalized probabilities, in a tensor sized + ``[batch_size, anchors_per_cell * height * width, num_classes + 5]`` and a list of dictionaries, containing + the same predictions, but with unnormalized probabilities (for loss calculation). + """ + batch_size, num_features, height, width = x.shape + num_attrs = self.num_classes + 5 + anchors_per_cell = num_features // num_attrs + if anchors_per_cell != len(self.prior_shapes): + raise ValueError( + "The model predicts {} bounding boxes per spatial location, but {} prior box dimensions are defined " + "for this layer.".format(anchors_per_cell, len(self.prior_shapes)) + ) + + # Reshape the output to have the bounding box attributes of each grid cell on its own row. + x = x.permute(0, 2, 3, 1) # [batch_size, height, width, anchors_per_cell * num_attrs] + x = x.view(batch_size, height, width, anchors_per_cell, num_attrs) + + # Take the sigmoid of the bounding box coordinates, confidence score, and class probabilities, unless the input + # is normalized by the previous layer activation. Confidence and class losses use the unnormalized values if + # possible. + norm_x = x if self.input_is_normalized else torch.sigmoid(x) + xy = norm_x[..., :2] + wh = x[..., 2:4] + confidence = x[..., 4] + classprob = x[..., 5:] + norm_confidence = norm_x[..., 4] + norm_classprob = norm_x[..., 5:] + + # Eliminate grid sensitivity. The previous layer should output extremely high values for the sigmoid to produce + # x/y coordinates close to one. YOLOv4 solves this by scaling the x/y coordinates. + xy = xy * self.xy_scale - 0.5 * (self.xy_scale - 1) + + image_xy = global_xy(xy, image_size) + prior_shapes = torch.tensor(self.prior_shapes, dtype=wh.dtype, device=wh.device) + if self.input_is_normalized: + image_wh = 4 * torch.square(wh) * prior_shapes + else: + image_wh = torch.exp(wh) * prior_shapes + box = torch.cat((image_xy, image_wh), -1) + box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy") + output = torch.cat((box, norm_confidence.unsqueeze(-1), norm_classprob), -1) + output = output.reshape(batch_size, height * width * anchors_per_cell, num_attrs) + + # It's better to use binary_cross_entropy_with_logits() for loss computation, so we'll provide the unnormalized + # confidence and classprob, when available. 
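+        # Each dictionary in the list below holds the predictions for one image of the batch; target matching
+        # and loss computation operate on these per-image dictionaries.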
+ preds = [{"boxes": b, "confidences": c, "classprobs": p} for b, c, p in zip(box, confidence, classprob)] + + return output, preds + + def match_targets( + self, + preds: PREDS, + return_preds: PREDS, + targets: TARGETS, + image_size: Tensor, + ) -> Tuple[PRED, TARGET]: + """Matches the predictions to targets. + + Args: + preds: List of predictions for each image, as returned by the ``forward()`` method of this layer. These will + be matched to the training targets. + return_preds: List of predictions for each image. The matched predictions will be returned from this list. + When calculating the auxiliary loss for deep supervision, predictions from a different layer are used + for loss computation. + targets: List of training targets for each image. + image_size: Width and height in a vector that defines the scale of the target coordinates. + + Returns: + Two dictionaries, the matched predictions and targets. + """ + batch_size = len(preds) + if (len(targets) != batch_size) or (len(return_preds) != batch_size): + raise ValueError("Different batch size for predictions and targets.") + + # Creating lists that are concatenated in the end will confuse TorchScript compilation. Instead, we'll create + # tensors and concatenate new matches immediately. + pred_boxes = torch.empty((0, 4), device=return_preds[0]["boxes"].device) + pred_confidences = torch.empty(0, device=return_preds[0]["confidences"].device) + pred_bg_confidences = torch.empty(0, device=return_preds[0]["confidences"].device) + pred_classprobs = torch.empty((0, self.num_classes), device=return_preds[0]["classprobs"].device) + target_boxes = torch.empty((0, 4), device=targets[0]["boxes"].device) + target_labels = torch.empty(0, dtype=torch.int64, device=targets[0]["labels"].device) + + for image_preds, image_return_preds, image_targets in zip(preds, return_preds, targets): + if image_targets["boxes"].shape[0] > 0: + pred_selector, background_selector, target_selector = self.matching_func( + image_preds, image_targets, image_size + ) + pred_boxes = torch.cat((pred_boxes, image_return_preds["boxes"][pred_selector])) + pred_confidences = torch.cat((pred_confidences, image_return_preds["confidences"][pred_selector])) + pred_bg_confidences = torch.cat( + (pred_bg_confidences, image_return_preds["confidences"][background_selector]) + ) + pred_classprobs = torch.cat((pred_classprobs, image_return_preds["classprobs"][pred_selector])) + target_boxes = torch.cat((target_boxes, image_targets["boxes"][target_selector])) + target_labels = torch.cat((target_labels, image_targets["labels"][target_selector])) + else: + pred_bg_confidences = torch.cat((pred_bg_confidences, image_return_preds["confidences"].flatten())) + + matched_preds = { + "boxes": pred_boxes, + "confidences": pred_confidences, + "bg_confidences": pred_bg_confidences, + "classprobs": pred_classprobs, + } + matched_targets = { + "boxes": target_boxes, + "labels": target_labels, + } + return matched_preds, matched_targets + + def calculate_losses( + self, + preds: PREDS, + targets: TARGETS, + image_size: Tensor, + loss_preds: Optional[PREDS] = None, + ) -> Tuple[Tensor, int]: + """Matches the predictions to targets and computes the losses. + + Args: + preds: List of predictions for each image, as returned by ``forward()``. These will be matched to the + training targets and used to compute the losses (unless another set of predictions for loss computation + is given in ``loss_preds``). + targets: List of training targets for each image. 
+ image_size: Width and height in a vector that defines the scale of the target coordinates. + loss_preds: List of predictions for each image. If given, these will be used for loss computation, instead + of the same predictions that were used for matching. This is needed for deep supervision in YOLOv7. + + Returns: + A vector of the overlap, confidence, and classification loss, normalized by batch size, and the number of + targets that were matched to this layer. + """ + if loss_preds is None: + loss_preds = preds + + matched_preds, matched_targets = self.match_targets(preds, loss_preds, targets, image_size) + + losses = self.loss_func.elementwise_sums(matched_preds, matched_targets, self.input_is_normalized, image_size) + losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / len(preds) + + hits = len(matched_targets["boxes"]) + + return losses, hits + + +def create_detection_layer( + prior_shapes: PRIOR_SHAPES, + prior_shape_idxs: List[int], + matching_algorithm: Optional[str] = None, + matching_threshold: Optional[float] = None, + spatial_range: float = 5.0, + size_range: float = 4.0, + ignore_bg_threshold: float = 0.7, + overlap_func: str = "ciou", + predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, + overlap_loss_multiplier: float = 5.0, + confidence_loss_multiplier: float = 1.0, + class_loss_multiplier: float = 1.0, + **kwargs: Any, +) -> DetectionLayer: + """Creates a detection layer module and the required loss function and target matching objects. + + Args: + prior_shapes: A list of all the prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. + prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that + this layer uses. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 
0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + num_classes: Number of different classes that this layer predicts. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + input_is_normalized: The input is normalized by logistic activation in the previous layer. In this case the + detection layer will not take the sigmoid of the coordinate and probability predictions, and the width and + height are scaled up so that the maximum value is four times the anchor dimension. This is used by the + Darknet configurations of Scaled-YOLOv4. + """ + matching_func: Callable + if matching_algorithm == "simota": + loss_func = YOLOLoss( + overlap_func, None, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + ) + matching_func = SimOTAMatching(prior_shapes, prior_shape_idxs, loss_func, spatial_range, size_range) + elif matching_algorithm == "size": + if matching_threshold is None: + raise ValueError("matching_threshold is required with size ratio matching.") + matching_func = SizeRatioMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + elif matching_algorithm == "iou": + if matching_threshold is None: + raise ValueError("matching_threshold is required with IoU threshold matching.") + matching_func = IoUThresholdMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + elif matching_algorithm == "maxiou" or matching_algorithm is None: + matching_func = HighestIoUMatching(prior_shapes, prior_shape_idxs, ignore_bg_threshold) + else: + raise ValueError(f"Matching algorithm `{matching_algorithm}´ is unknown.") + + loss_func = YOLOLoss( + overlap_func, + predict_overlap, + label_smoothing, + overlap_loss_multiplier, + confidence_loss_multiplier, + class_loss_multiplier, + ) + layer_shapes = [prior_shapes[i] for i in prior_shape_idxs] + return DetectionLayer(prior_shapes=layer_shapes, matching_func=matching_func, loss_func=loss_func, **kwargs) + + +class DetectionStage(nn.Module): + """This is a convenience class for running a detection layer. + + It might be cleaner to implement this as a function, but TorchScript allows only specific types in function + arguments, not modules. + """ + + def __init__(self, **kwargs: Any) -> None: + super().__init__() + self.detection_layer = create_detection_layer(**kwargs) + + def forward( + self, + layer_input: Tensor, + targets: Optional[TARGETS], + image_size: Tensor, + detections: List[Tensor], + losses: List[Tensor], + hits: List[int], + ) -> None: + """Runs the detection layer on the inputs and appends the output to the ``detections`` list. + + If ``targets`` is given, also calculates the losses and appends to the ``losses`` list. + + Args: + layer_input: Input to the detection layer. + targets: List of training targets for each image. + image_size: Width and height in a vector that defines the scale of the target coordinates. + detections: A list where a tensor containing the detections will be appended to. + losses: A list where a tensor containing the losses will be appended to, if ``targets`` is given. 
+            hits: A list where the number of targets that matched this layer will be appended to, if ``targets`` is
+                given.
+        """
+        output, preds = self.detection_layer(layer_input, image_size)
+        detections.append(output)
+
+        if targets is not None:
+            layer_losses, layer_hits = self.detection_layer.calculate_losses(preds, targets, image_size)
+            losses.append(layer_losses)
+            hits.append(layer_hits)
+
+
+class DetectionStageWithAux(nn.Module):
+    """This class represents a combination of a lead and an auxiliary detection layer.
+
+    Args:
+        spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell
+            area centered at the target. This parameter specifies `N` for the lead head.
+        aux_spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell
+            area centered at the target. This parameter specifies `N` for the auxiliary head.
+        aux_weight: Weight for the loss from the auxiliary head.
+    """
+
+    def __init__(
+        self, spatial_range: float = 5.0, aux_spatial_range: float = 3.0, aux_weight: float = 0.25, **kwargs: Any
+    ) -> None:
+        super().__init__()
+        self.detection_layer = create_detection_layer(spatial_range=spatial_range, **kwargs)
+        self.aux_detection_layer = create_detection_layer(spatial_range=aux_spatial_range, **kwargs)
+        self.aux_weight = aux_weight
+
+    def forward(
+        self,
+        layer_input: Tensor,
+        aux_input: Tensor,
+        targets: Optional[TARGETS],
+        image_size: Tensor,
+        detections: List[Tensor],
+        losses: List[Tensor],
+        hits: List[int],
+    ) -> None:
+        """Runs the detection layer and the auxiliary detection layer on their respective inputs and appends the
+        outputs to the ``detections`` list.
+
+        If ``targets`` is given, also calculates the losses and appends to the ``losses`` list.
+
+        Args:
+            layer_input: Input to the lead detection layer.
+            aux_input: Input to the auxiliary detection layer.
+            targets: List of training targets for each image.
+            image_size: Width and height in a vector that defines the scale of the target coordinates.
+            detections: A list where a tensor containing the detections will be appended to.
+            losses: A list where a tensor containing the losses will be appended to, if ``targets`` is given.
+            hits: A list where the number of targets that matched this layer will be appended to, if ``targets`` is
+                given.
+        """
+        output, preds = self.detection_layer(layer_input, image_size)
+        detections.append(output)
+
+        if targets is not None:
+            # Match lead head predictions to targets and calculate losses from lead head outputs.
+            layer_losses, layer_hits = self.detection_layer.calculate_losses(preds, targets, image_size)
+            losses.append(layer_losses)
+            hits.append(layer_hits)
+
+            # Match lead head predictions to targets and calculate losses from auxiliary head outputs.
+            _, aux_preds = self.aux_detection_layer(aux_input, image_size)
+            layer_losses, layer_hits = self.aux_detection_layer.calculate_losses(
+                preds, targets, image_size, loss_preds=aux_preds
+            )
+            losses.append(layer_losses * self.aux_weight)
+            hits.append(layer_hits)
+
+
+@torch.jit.script
+def get_image_size(images: Tensor) -> Tensor:
+    """Get the image size from an input tensor.
+
+    The function needs the ``@torch.jit.script`` decorator in order for ONNX generation to work. The tracing-based
+    generator will lose track of e.g. ``images.shape[1]`` and treat it as a Python variable and not a tensor. This will
+    cause the dimension to be treated as a constant in the model, which prevents dynamic input sizes.
+ + Args: + images: An image batch to take the width and height from. + + Returns: + A tensor that contains the image width and height. + """ + height = images.shape[2] + width = images.shape[3] + return torch.tensor([width, height], device=images.device) + + +class YOLOV4TinyNetwork(nn.Module): + """The "tiny" network architecture from YOLOv4. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + width: The number of channels in the narrowest convolutional layer. The wider convolutional layers will use a + number of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `3N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. 
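+
+    ``forward()`` returns three lists: the detections from each of the three detection layers, the per-layer losses,
+    and the per-layer hit counts; the latter two are filled in only when ``targets`` are given.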
+ """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + width: int = 32, + activation: Optional[str] = "leaky", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. + if prior_shapes is None: + prior_shapes = [ + [12, 16], + [19, 36], + [40, 28], + [36, 75], + [76, 55], + [72, 146], + [142, 110], + [192, 243], + [459, 401], + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def outputs(in_channels: int) -> nn.Module: + return nn.Conv2d(in_channels, num_outputs, kernel_size=1, stride=1, bias=True) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: + assert prior_shapes is not None + return DetectionStage( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + self.backbone = backbone or YOLOV4TinyBackbone(width=width, activation=activation, normalization=normalization) + + self.fpn5 = conv(width * 16, width * 8) + self.out5 = nn.Sequential( + OrderedDict( + [ + ("channels", conv(width * 8, width * 16)), + (f"outputs_{num_outputs}", outputs(width * 16)), + ] + ) + ) + self.upsample5 = upsample(width * 8, width * 4) + + self.fpn4 = conv(width * 12, width * 8, kernel_size=3) + self.out4 = nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs(width * 8))])) + self.upsample4 = upsample(width * 8, width * 2) + + self.fpn3 = conv(width * 6, width * 4, kernel_size=3) + self.out3 = nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs(width * 4))])) + + self.detect3 = detect([0, 1, 2]) + self.detect4 = detect([3, 4, 5]) + self.detect5 = detect([6, 7, 8]) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5 = self.backbone(x)[-3:] + + p5 = self.fpn5(c5) + x = torch.cat((self.upsample5(p5), c4), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), c3), dim=1) + p3 = self.fpn3(x) + + self.detect5(self.out5(p5), targets, image_size, detections, losses, hits) + self.detect4(self.out4(p4), targets, image_size, detections, losses, hits) + self.detect3(self.out3(p3), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOV4Network(nn.Module): + """Network architecture that corresponds approximately to the Cross Stage Partial Network from YOLOv4. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. 
+ widths: Number of channels at each network stage. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `3N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. 
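+        # With the default shapes, three priors go to each detection layer: the smallest ones to the high-resolution
+        # ``detect3`` and the largest ones to the low-resolution ``detect5`` (see the ``detect()`` calls below).
+        # With 80 classes and three anchors per cell, each output convolution then produces (5 + 80) * 3 = 255 channels.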
+ if prior_shapes is None: + prior_shapes = [ + [12, 16], + [19, 36], + [40, 28], + [36, 75], + [76, 55], + [72, 146], + [142, 110], + [192, 243], + [459, 401], + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def csp(in_channels: int, out_channels: int) -> nn.Module: + return CSPStage( + in_channels, + out_channels, + depth=2, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def out(in_channels: int) -> nn.Module: + conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: + assert prior_shapes is not None + return DetectionStage( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV4Backbone(widths=widths, activation=activation, normalization=normalization) + + w3 = widths[-3] + w4 = widths[-2] + w5 = widths[-1] + + self.spp = spp(w5, w5) + + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5, w4 // 2) + self.fpn4 = csp(w4, w4) + + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4, w3 // 2) + self.fpn3 = csp(w3, w3) + + self.downsample3 = downsample(w3, w3) + self.pan4 = csp(w3 + w4, w4) + + self.downsample4 = downsample(w4, w4) + self.pan5 = csp(w4 + w5, w5) + + self.out3 = out(w3) + self.out4 = out(w4) + self.out5 = out(w5) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) + + x = torch.cat((self.upsample5(c5), self.pre4(c4)), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) + n3 = self.fpn3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), c5), dim=1) + n5 = self.pan5(x) + + self.detect3(self.out3(n3), targets, image_size, detections, 
losses, hits) + self.detect4(self.out4(n4), targets, image_size, detections, losses, hits) + self.detect5(self.out5(n5), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOV4P6Network(nn.Module): + """Network architecture that corresponds approximately to the variant of YOLOv4 with four detection layers. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + widths: Number of channels at each network stage. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `4N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. 
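+
+    Compared to the three-layer networks above, this variant adds a fourth, lower-resolution detection layer
+    (``detect6``) that is assigned the largest prior shapes.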
+ """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024, 1024), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. + if prior_shapes is None: + prior_shapes = [ + [13, 17], + [31, 25], + [24, 51], + [61, 45], + [61, 45], + [48, 102], + [119, 96], + [97, 189], + [97, 189], + [217, 184], + [171, 384], + [324, 451], + [324, 451], + [545, 357], + [616, 618], + [1024, 1024], + ] + anchors_per_cell = 4 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 4) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 4.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def csp(in_channels: int, out_channels: int) -> nn.Module: + return CSPStage( + in_channels, + out_channels, + depth=2, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def out(in_channels: int) -> nn.Module: + conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: + assert prior_shapes is not None + return DetectionStage( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV4Backbone( + widths=widths, depths=(1, 1, 3, 15, 15, 7, 7), activation=activation, normalization=normalization + ) + + w3 = widths[-4] + w4 = widths[-3] + w5 = widths[-2] + w6 = widths[-1] + + self.spp = spp(w6, w6) + + self.pre5 = conv(w5, w5 // 2) + self.upsample6 = upsample(w6, w5 // 2) + self.fpn5 = csp(w5, w5) + + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5, w4 // 2) + self.fpn4 = csp(w4, w4) + + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4, w3 // 2) + self.fpn3 = csp(w3, w3) + + self.downsample3 = downsample(w3, w3) + self.pan4 = csp(w3 + w4, w4) + + self.downsample4 = downsample(w4, w4) + self.pan5 = csp(w4 + w5, w5) + + self.downsample5 = downsample(w5, w5) + self.pan6 = csp(w5 + w6, w6) + + self.out3 = out(w3) + self.out4 = out(w4) + self.out5 = out(w5) + self.out6 = out(w6) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, 
anchors_per_cell * 3)) + self.detect6 = detect(range(anchors_per_cell * 3, anchors_per_cell * 4)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5, x = self.backbone(x)[-4:] + c6 = self.spp(x) + + x = torch.cat((self.upsample6(c6), self.pre5(c5)), dim=1) + p5 = self.fpn5(x) + x = torch.cat((self.upsample5(p5), self.pre4(c4)), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) + n3 = self.fpn3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + x = torch.cat((self.downsample5(n5), c6), dim=1) + n6 = self.pan6(x) + + self.detect3(self.out3(n3), targets, image_size, detections, losses, hits) + self.detect4(self.out4(n4), targets, image_size, detections, losses, hits) + self.detect5(self.out5(n5), targets, image_size, detections, losses, hits) + self.detect6(self.out6(n6), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOV5Network(nn.Module): + """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting the ``depth`` + and ``width`` parameters. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. The values used by the different variants are 16 (yolov5n), 32 + (yolov5s), 48 (yolov5m), 64 (yolov5l), and 80 (yolov5x). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolov5n, yolov5s), 2 (yolov5m), 3 (yolov5l), and 4 (yolov5x). + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `3N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. 
+ size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + width: int = 64, + depth: int = 3, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. + if prior_shapes is None: + prior_shapes = [ + [12, 16], + [19, 36], + [40, 28], + [36, 75], + [76, 55], + [72, 146], + [142, 110], + [192, 243], + [459, 401], + ] + anchors_per_cell = 3 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def out(in_channels: int) -> nn.Module: + outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([(f"outputs_{num_outputs}", outputs)])) + + def csp(in_channels: int, out_channels: int) -> nn.Module: + return CSPStage( + in_channels, + out_channels, + depth=depth, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: + assert prior_shapes is not None + return DetectionStage( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + self.backbone = backbone or YOLOV5Backbone( + depth=depth, width=width, activation=activation, normalization=normalization + ) + + self.spp = spp(width * 16, width * 16) + + self.pan3 = csp(width * 8, width * 
4) + self.out3 = out(width * 4) + + self.fpn4 = nn.Sequential( + OrderedDict( + [ + ("csp", csp(width * 16, width * 8)), + ("conv", conv(width * 8, width * 4)), + ] + ) + ) + self.pan4 = csp(width * 8, width * 8) + self.out4 = out(width * 8) + + self.fpn5 = conv(width * 16, width * 8) + self.pan5 = csp(width * 16, width * 16) + self.out5 = out(width * 16) + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + self.downsample3 = downsample(width * 4, width * 4) + self.downsample4 = downsample(width * 8, width * 8) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) + + p5 = self.fpn5(c5) + x = torch.cat((self.upsample(p5), c4), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample(p4), c3), dim=1) + + n3 = self.pan3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + + self.detect3(self.out3(n3), targets, image_size, detections, losses, hits) + self.detect4(self.out4(n4), targets, image_size, detections, losses, hits) + self.detect5(self.out5(n5), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOV7Network(nn.Module): + """Network architecture that corresponds to the W6 variant of YOLOv7 with four detection layers. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + widths: Number of channels at each network stage. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `4N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target. This parameter specifies `N` for the lead head. + aux_spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target. 
This parameter specifies `N` for the auxiliary head. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. + aux_weight: Weight for the loss from the auxiliary heads. + """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + widths: Sequence[int] = (64, 128, 256, 512, 768, 1024), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use the prior shapes that have been learned from the COCO data. 
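+        # With the default shapes, four priors go to each of the four detection layers. Note that some of the
+        # default shapes are listed twice, so adjacent scales end up sharing an anchor size.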
+ if prior_shapes is None: + prior_shapes = [ + [13, 17], + [31, 25], + [24, 51], + [61, 45], + [61, 45], + [48, 102], + [119, 96], + [97, 189], + [97, 189], + [217, 184], + [171, 384], + [324, 451], + [324, 451], + [545, 357], + [616, 618], + [1024, 1024], + ] + anchors_per_cell = 4 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 4) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 4.") + num_outputs = (5 + num_classes) * anchors_per_cell + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + + def elan(in_channels: int, out_channels: int) -> nn.Module: + return ELANStage( + in_channels, + out_channels, + split_channels=out_channels, + depth=4, + block_depth=1, + norm=normalization, + activation=activation, + ) + + def out(in_channels: int, hidden_channels: int) -> nn.Module: + conv = Conv( + in_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=normalization + ) + outputs = nn.Conv2d(hidden_channels, num_outputs, kernel_size=1) + return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + + def upsample(in_channels: int, out_channels: int) -> nn.Module: + channels = conv(in_channels, out_channels) + upsample = nn.Upsample(scale_factor=2, mode="nearest") + return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStageWithAux: + assert prior_shapes is not None + return DetectionStageWithAux( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + if backbone is not None: + self.backbone = backbone + else: + self.backbone = YOLOV7Backbone( + widths=widths, depth=2, block_depth=2, activation=activation, normalization=normalization + ) + + w3 = widths[-4] + w4 = widths[-3] + w5 = widths[-2] + w6 = widths[-1] + + self.spp = spp(w6, w6 // 2) + + self.pre5 = conv(w5, w5 // 2) + self.upsample6 = upsample(w6 // 2, w5 // 2) + self.fpn5 = elan(w5, w5 // 2) + + self.pre4 = conv(w4, w4 // 2) + self.upsample5 = upsample(w5 // 2, w4 // 2) + self.fpn4 = elan(w4, w4 // 2) + + self.pre3 = conv(w3, w3 // 2) + self.upsample4 = upsample(w4 // 2, w3 // 2) + self.fpn3 = elan(w3, w3 // 2) + + self.downsample3 = downsample(w3 // 2, w4 // 2) + self.pan4 = elan(w4, w4 // 2) + + self.downsample4 = downsample(w4 // 2, w5 // 2) + self.pan5 = elan(w5, w5 // 2) + + self.downsample5 = downsample(w5 // 2, w6 // 2) + self.pan6 = elan(w6, w6 // 2) + + self.out3 = out(w3 // 2, w3) + self.aux_out3 = out(w3 // 2, w3 + (w3 // 4)) + self.out4 = out(w4 // 2, w4) + self.aux_out4 = out(w4 // 2, w4 + (w4 // 4)) + self.out5 = out(w5 // 2, w5) + self.aux_out5 = out(w5 // 2, w5 + (w5 // 4)) + self.out6 = out(w6 // 2, w6) + self.aux_out6 = out(w6 // 2, w6 + (w6 // 4)) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + self.detect6 = 
detect(range(anchors_per_cell * 3, anchors_per_cell * 4)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, c5, x = self.backbone(x)[-4:] + c6 = self.spp(x) + + x = torch.cat((self.upsample6(c6), self.pre5(c5)), dim=1) + p5 = self.fpn5(x) + x = torch.cat((self.upsample5(p5), self.pre4(c4)), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample4(p4), self.pre3(c3)), dim=1) + n3 = self.fpn3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + x = torch.cat((self.downsample5(n5), c6), dim=1) + n6 = self.pan6(x) + + self.detect3(self.out3(n3), self.aux_out3(n3), targets, image_size, detections, losses, hits) + self.detect4(self.out4(n4), self.aux_out4(p4), targets, image_size, detections, losses, hits) + self.detect5(self.out5(n5), self.aux_out5(p5), targets, image_size, detections, losses, hits) + self.detect6(self.out6(n6), self.aux_out6(c6), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class YOLOXHead(nn.Module): + """A module that produces features for YOLO detection layer, decoupling the classification and localization + features. + + Args: + in_channels: Number of input channels that the module expects. + hidden_channels: Number of output channels in the hidden layers. + anchors_per_cell: Number of detections made at each spatial location of the feature map. + num_classes: Number of different classes that this model predicts. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + hidden_channels: int, + anchors_per_cell: int, + num_classes: int, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=norm) + + def linear(in_channels: int, out_channels: int) -> nn.Module: + return nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def features(num_channels: int) -> nn.Module: + return nn.Sequential( + conv(num_channels, num_channels, kernel_size=3), + conv(num_channels, num_channels, kernel_size=3), + ) + + def classprob(num_channels: int) -> nn.Module: + num_outputs = anchors_per_cell * num_classes + outputs = linear(num_channels, num_outputs) + return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) + + self.stem = conv(in_channels, hidden_channels) + self.feat = features(hidden_channels) + self.box = linear(hidden_channels, anchors_per_cell * 4) + self.confidence = linear(hidden_channels, anchors_per_cell) + self.classprob = classprob(hidden_channels) + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + features = self.feat(x) + box = self.box(features) + confidence = self.confidence(features) + classprob = self.classprob(x) + return torch.cat((box, confidence, classprob), dim=1) + + +class YOLOXNetwork(nn.Module): + """The YOLOX network architecture. 
Different variants (nano/tiny/s/m/l/x) can be achieved by adjusting the + ``depth`` and ``width`` parameters. + + Args: + num_classes: Number of different classes that this model predicts. + backbone: A backbone network that returns the output from each stage. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. The values used by the different variants are 24 (yolox-tiny), + 32 (yolox-s), 48 (yolox-m), and 64 (yolox-l). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolox-tiny, yolox-s), 2 (yolox-m), and 3 (yolox-l). + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `3N` pairs, where `N` is the number of anchors per spatial location. They are + assigned to the layers from the lowest (high-resolution) to the highest (low-resolution) layer, meaning that + you typically want to sort the shapes from the smallest to the largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the prior shape has IoU + with some target greater than this threshold, the predictor will not be taken into account when calculating + the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + xy_scale: Eliminate "grid sensitivity" by scaling the box coordinates by this factor. Using a value > 1.0 helps + to produce coordinate values close to one. 
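+
+    By default the network follows the anchor-free YOLOX convention: a single prior per cell whose size equals the
+    stride of the corresponding detection layer (8, 16, or 32 pixels).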
+ """ + + def __init__( + self, + num_classes: int, + backbone: Optional[nn.Module] = None, + width: int = 64, + depth: int = 3, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + prior_shapes: Optional[PRIOR_SHAPES] = None, + **kwargs: Any, + ) -> None: + super().__init__() + + # By default use one anchor per cell and the stride as the prior size. + if prior_shapes is None: + prior_shapes = [[8, 8], [16, 16], [32, 32]] + anchors_per_cell = 1 + else: + anchors_per_cell, modulo = divmod(len(prior_shapes), 3) + if modulo != 0: + raise ValueError("The number of provided prior shapes needs to be divisible by 3.") + + def spp(in_channels: int, out_channels: int) -> nn.Module: + return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + + def csp(in_channels: int, out_channels: int) -> nn.Module: + return CSPStage( + in_channels, + out_channels, + depth=depth, + shortcut=False, + norm=normalization, + activation=activation, + ) + + def head(in_channels: int, hidden_channels: int) -> YOLOXHead: + return YOLOXHead( + in_channels, + hidden_channels, + anchors_per_cell, + num_classes, + activation=activation, + norm=normalization, + ) + + def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: + assert prior_shapes is not None + return DetectionStage( + prior_shapes=prior_shapes, + prior_shape_idxs=list(prior_shape_idxs), + num_classes=num_classes, + input_is_normalized=False, + **kwargs, + ) + + self.backbone = backbone or YOLOV5Backbone( + depth=depth, width=width, activation=activation, normalization=normalization + ) + + self.spp = spp(width * 16, width * 16) + + self.pan3 = csp(width * 8, width * 4) + self.out3 = head(width * 4, width * 4) + + self.fpn4 = nn.Sequential( + OrderedDict( + [ + ("csp", csp(width * 16, width * 8)), + ("conv", conv(width * 8, width * 4)), + ] + ) + ) + self.pan4 = csp(width * 8, width * 8) + self.out4 = head(width * 8, width * 4) + + self.fpn5 = conv(width * 16, width * 8) + self.pan5 = csp(width * 16, width * 16) + self.out5 = head(width * 16, width * 4) + + self.upsample = nn.Upsample(scale_factor=2, mode="nearest") + + self.downsample3 = downsample(width * 4, width * 4) + self.downsample4 = downsample(width * 8, width * 8) + + self.detect3 = detect(range(0, anchors_per_cell)) + self.detect4 = detect(range(anchors_per_cell, anchors_per_cell * 2)) + self.detect5 = detect(range(anchors_per_cell * 2, anchors_per_cell * 3)) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + c3, c4, x = self.backbone(x)[-3:] + c5 = self.spp(x) + + p5 = self.fpn5(c5) + x = torch.cat((self.upsample(p5), c4), dim=1) + p4 = self.fpn4(x) + x = torch.cat((self.upsample(p4), c3), dim=1) + + n3 = self.pan3(x) + x = torch.cat((self.downsample3(n3), p4), dim=1) + n4 = self.pan4(x) + x = torch.cat((self.downsample4(n4), p5), dim=1) + n5 = self.pan5(x) + + self.detect3(self.out3(n3), 
targets, image_size, detections, losses, hits) + self.detect4(self.out4(n4), targets, image_size, detections, losses, hits) + self.detect5(self.out5(n5), targets, image_size, detections, losses, hits) + return detections, losses, hits + + +class DarknetNetwork(nn.Module): + """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. + + Iterates through the layers from the configuration and creates corresponding PyTorch modules. If ``weights_path`` is + given and points to a Darknet model file, loads the convolutional layer weights from the file. + + Args: + config_path: Path to a Darknet configuration file that defines the network architecture. + weights_path: Path to a Darknet model file. If given, the model weights will be read from this file. + in_channels: Number of channels in the input image. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. + label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + """ + + def __init__( + self, config_path: str, weights_path: Optional[str] = None, in_channels: Optional[int] = None, **kwargs: Any + ) -> None: + super().__init__() + + with open(config_path) as config_file: + sections = self._read_config(config_file) + + if len(sections) < 2: + raise ValueError("The model configuration file should include at least two sections.") + + self.__dict__.update(sections[0]) + global_config = sections[0] + layer_configs = sections[1:] + + if in_channels is None: + in_channels = global_config.get("channels", 3) + assert isinstance(in_channels, int) + + self.layers = nn.ModuleList() + # num_inputs will contain the number of channels in the input of every layer up to the current layer. It is + # initialized with the number of channels in the input image. 
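+        # For example, if the first three layer sections were convolutional layers with 32, 64, and 32 filters, the
+        # list would grow from [in_channels] to [in_channels, 32, 64, 32]. (An illustrative sketch, not a reference
+        # to any particular configuration file.)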
+ num_inputs = [in_channels] + for layer_config in layer_configs: + config = {**global_config, **layer_config} + layer, num_outputs = _create_layer(config, num_inputs, **kwargs) + self.layers.append(layer) + num_inputs.append(num_outputs) + + if weights_path is not None: + with open(weights_path) as weight_file: + self.load_weights(weight_file) + + # A workaround for TorchScript compilation. For some reason, the compilation will crash with "Unknown type name + # 'ShortcutLayer'" without this. + self._ = ShortcutLayer(0) + + def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPUT: + outputs: List[Tensor] = [] # Outputs from all layers + detections: List[Tensor] = [] # Outputs from detection layers + losses: List[Tensor] = [] # Losses from detection layers + hits: List[int] = [] # Number of targets each detection layer was responsible for + + image_size = get_image_size(x) + + for layer in self.layers: + if isinstance(layer, (RouteLayer, ShortcutLayer)): + x = layer(outputs) + elif isinstance(layer, DetectionLayer): + x, preds = layer(x, image_size) + detections.append(x) + if targets is not None: + layer_losses, layer_hits = layer.calculate_losses(preds, targets, image_size) + losses.append(layer_losses) + hits.append(layer_hits) + else: + x = layer(x) + + outputs.append(x) + + return detections, losses, hits + + def load_weights(self, weight_file: io.IOBase) -> None: + """Loads weights to layer modules from a pretrained Darknet model. + + One may want to continue training from pretrained weights, on a dataset with a different number of object + categories. The number of kernels in the convolutional layers just before each detection layer depends on the + number of output classes. The Darknet solution is to truncate the weight file and stop reading weights at the + first incompatible layer. For this reason the function silently leaves the rest of the layers unchanged, when + the weight file ends. + + Args: + weight_file: A file-like object containing model weights in the Darknet binary format. + """ + if not isinstance(weight_file, io.IOBase): + raise ValueError("weight_file must be a file-like object.") + + version = np.fromfile(weight_file, count=3, dtype=np.int32) + images_seen = np.fromfile(weight_file, count=1, dtype=np.int64) + print( + f"Loading weights from Darknet model version {version[0]}.{version[1]}.{version[2]} " + f"that has been trained on {images_seen[0]} images." + ) + + def read(tensor: Tensor) -> int: + """Reads the contents of ``tensor`` from the current position of ``weight_file``. + + Returns the number of elements read. If there's no more data in ``weight_file``, returns 0. + """ + np_array = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) + num_elements = np_array.size + if num_elements > 0: + source = torch.from_numpy(np_array).view_as(tensor) + with torch.no_grad(): + tensor.copy_(source) + return num_elements + + for layer in self.layers: + # Weights are loaded only to convolutional layers + if not isinstance(layer, Conv): + continue + + # If convolution is followed by batch normalization, read the batch normalization parameters. Otherwise we + # read the convolution bias. 
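+            # For each convolutional layer, the Darknet format stores the bias or the batch normalization parameters
+            # first and the convolution kernels last, which is the order in which they are read below.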
+ if isinstance(layer.norm, nn.Identity): + assert layer.conv.bias is not None + read(layer.conv.bias) + else: + assert isinstance(layer.norm, nn.BatchNorm2d) + assert layer.norm.running_mean is not None + assert layer.norm.running_var is not None + read(layer.norm.bias) + read(layer.norm.weight) + read(layer.norm.running_mean) + read(layer.norm.running_var) + + read_count = read(layer.conv.weight) + if read_count == 0: + return + + def _read_config(self, config_file: Iterable[str]) -> List[Dict[str, Any]]: + """Reads a Darnet network configuration file and returns a list of configuration sections. + + Args: + config_file: The configuration file to read. + + Returns: + A list of configuration sections. + """ + section_re = re.compile(r"\[([^]]+)\]") + list_variables = ("layers", "anchors", "mask", "scales") + variable_types = { + "activation": str, + "anchors": int, + "angle": float, + "batch": int, + "batch_normalize": bool, + "beta_nms": float, + "burn_in": int, + "channels": int, + "classes": int, + "cls_normalizer": float, + "decay": float, + "exposure": float, + "filters": int, + "from": int, + "groups": int, + "group_id": int, + "height": int, + "hue": float, + "ignore_thresh": float, + "iou_loss": str, + "iou_normalizer": float, + "iou_thresh": float, + "jitter": float, + "layers": int, + "learning_rate": float, + "mask": int, + "max_batches": int, + "max_delta": float, + "momentum": float, + "mosaic": bool, + "new_coords": int, + "nms_kind": str, + "num": int, + "obj_normalizer": float, + "pad": bool, + "policy": str, + "random": bool, + "resize": float, + "saturation": float, + "scales": float, + "scale_x_y": float, + "size": int, + "steps": str, + "stride": int, + "subdivisions": int, + "truth_thresh": float, + "width": int, + } + + section = None + sections = [] + + def convert(key: str, value: str) -> Union[str, int, float, List[Union[str, int, float]]]: + """Converts a value to the correct type based on key.""" + if key not in variable_types: + warn("Unknown YOLO configuration variable: " + key) + return value + if key in list_variables: + return [variable_types[key](v) for v in value.split(",")] + else: + return variable_types[key](value) + + for line in config_file: + line = line.strip() + if (not line) or (line[0] == "#"): + continue + + section_match = section_re.match(line) + if section_match: + if section is not None: + sections.append(section) + section = {"type": section_match.group(1)} + else: + if section is None: + raise RuntimeError("Darknet network configuration file does not start with a section header.") + key, value = line.split("=") + key = key.rstrip() + value = value.lstrip() + section[key] = convert(key, value) + if section is not None: + sections.append(section) + + return sections + + +def _create_layer(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the + layer config. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. 
+ """ + create_func: Dict[str, Callable[..., CREATE_LAYER_OUTPUT]] = { + "convolutional": _create_convolutional, + "maxpool": _create_maxpool, + "route": _create_route, + "shortcut": _create_shortcut, + "upsample": _create_upsample, + "yolo": _create_yolo, + } + return create_func[config["type"]](config, num_inputs, **kwargs) + + +def _create_convolutional(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a convolutional layer. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ + batch_normalize = config.get("batch_normalize", False) + padding = (config["size"] - 1) // 2 if config["pad"] else 0 + + layer = Conv( + num_inputs[-1], + config["filters"], + kernel_size=config["size"], + stride=config["stride"], + padding=padding, + bias=not batch_normalize, + activation=config["activation"], + norm="batchnorm" if batch_normalize else None, + ) + return layer, config["filters"] + + +def _create_maxpool(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a max pooling layer. + + Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ + layer = MaxPool(config["size"], config["stride"]) + return layer, num_inputs[-1] + + +def _create_route(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a routing layer. + + A routing layer concatenates the output (or part of it) from the layers specified by the "layers" configuration + option. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ + num_chunks = config.get("groups", 1) + chunk_idx = config.get("group_id", 0) + + # 0 is the first layer, -1 is the previous layer + last = len(num_inputs) - 1 + source_layers = [layer if layer >= 0 else last + layer for layer in config["layers"]] + + layer = RouteLayer(source_layers, num_chunks, chunk_idx) + + # The number of outputs of a source layer is the number of inputs of the next layer. + num_outputs = sum(num_inputs[layer + 1] // num_chunks for layer in source_layers) + + return layer, num_outputs + + +def _create_shortcut(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a shortcut layer. + + A shortcut layer adds a residual connection from the layer specified by the "from" configuration option. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. 
+ """ + layer = ShortcutLayer(config["from"]) + return layer, num_inputs[-1] + + +def _create_upsample(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: + """Creates a layer that upsamples the data. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output. + """ + layer = nn.Upsample(scale_factor=config["stride"], mode="nearest") + return layer, num_inputs[-1] + + +def _create_yolo( + config: DARKNET_CONFIG, + num_inputs: List[int], + prior_shapes: Optional[PRIOR_SHAPES] = None, + matching_algorithm: Optional[str] = None, + matching_threshold: Optional[float] = None, + spatial_range: float = 5.0, + size_range: float = 4.0, + ignore_bg_threshold: Optional[float] = None, + overlap_func: Optional[str] = None, + predict_overlap: Optional[float] = None, + label_smoothing: Optional[float] = None, + overlap_loss_multiplier: Optional[float] = None, + confidence_loss_multiplier: Optional[float] = None, + class_loss_multiplier: Optional[float] = None, + **kwargs: Any, +) -> CREATE_LAYER_OUTPUT: + """Creates a YOLO detection layer. + + Args: + config: Dictionary of configuration options for this layer. + num_inputs: Number of channels in the input of every layer up to this layer. Not used by the detection layer. + prior_shapes: A list of prior box dimensions, used for scaling the predicted dimensions and possibly for + matching the targets to the anchors. The list should contain [width, height] pairs in the network input + resolution. There should be `M x N` pairs, where `M` is the number of detection layers and `N` is the number + of anchors per spatial location. They are assigned to the layers from the lowest (high-resolution) to the + highest (low-resolution) layer, meaning that you typically want to sort the shapes from the smallest to the + largest. + matching_algorithm: Which algorithm to use for matching targets to anchors. "simota" (the SimOTA matching rule + from YOLOX), "size" (match those prior shapes, whose width and height relative to the target is below given + ratio), "iou" (match all prior shapes that give a high enough IoU), or "maxiou" (match the prior shape that + gives the highest IoU, default). + matching_threshold: Threshold for "size" and "iou" matching algorithms. + spatial_range: The "simota" matching algorithm will restrict to anchors that are within an `N x N` grid cell + area centered at the target, where `N` is the value of this parameter. + size_range: The "simota" matching algorithm will restrict to anchors whose dimensions are no more than `N` and + no less than `1/N` times the target dimensions, where `N` is the value of this parameter. + ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor + has IoU with some target greater than this threshold, the predictor will not be taken into account when + calculating the confidence loss. + overlap_func: Which function to use for calculating the IoU between two sets of boxes. Valid values are "iou", + "giou", "diou", and "ciou". + predict_overlap: Balance between binary confidence targets and predicting the overlap. 0.0 means that the target + confidence is 1 if there's an object, and 1.0 means that the target confidence is the output of + ``overlap_func``. 
+ label_smoothing: The epsilon parameter (weight) for class label smoothing. 0.0 means no smoothing (binary + targets), and 1.0 means that the target probabilities are always 0.5. + overlap_loss_multiplier: Overlap loss will be scaled by this value. + confidence_loss_multiplier: Confidence loss will be scaled by this value. + class_loss_multiplier: Classification loss will be scaled by this value. + + Returns: + module (:class:`~torch.nn.Module`), num_outputs (int): The created PyTorch module and the number of channels in + its output (always 0 for a detection layer). + """ + if prior_shapes is None: + # The "anchors" list alternates width and height. + dims = config["anchors"] + prior_shapes = [[dims[i], dims[i + 1]] for i in range(0, len(dims), 2)] + if ignore_bg_threshold is None: + ignore_bg_threshold = config.get("ignore_thresh", 1.0) + assert isinstance(ignore_bg_threshold, float) + if overlap_func is None: + overlap_func = config.get("iou_loss", "iou") + assert isinstance(overlap_func, str) + if overlap_loss_multiplier is None: + overlap_loss_multiplier = config.get("iou_normalizer", 1.0) + assert isinstance(overlap_loss_multiplier, float) + if confidence_loss_multiplier is None: + confidence_loss_multiplier = config.get("obj_normalizer", 1.0) + assert isinstance(confidence_loss_multiplier, float) + if class_loss_multiplier is None: + class_loss_multiplier = config.get("cls_normalizer", 1.0) + assert isinstance(class_loss_multiplier, float) + + layer = create_detection_layer( + num_classes=config["classes"], + prior_shapes=prior_shapes, + prior_shape_idxs=config["mask"], + matching_algorithm=matching_algorithm, + matching_threshold=matching_threshold, + spatial_range=spatial_range, + size_range=size_range, + ignore_bg_threshold=ignore_bg_threshold, + overlap_func=overlap_func, + predict_overlap=predict_overlap, + label_smoothing=label_smoothing, + overlap_loss_multiplier=overlap_loss_multiplier, + confidence_loss_multiplier=confidence_loss_multiplier, + class_loss_multiplier=class_loss_multiplier, + xy_scale=config.get("scale_x_y", 1.0), + input_is_normalized=config.get("new_coords", 0) > 0, + ) + return layer, 0 diff --git a/mart/models/yolo.py b/mart/models/yolo.py new file mode 100644 index 00000000..34f4fdbc --- /dev/null +++ b/mart/models/yolo.py @@ -0,0 +1,732 @@ +# https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/yolo.py +from collections import OrderedDict +from typing import List, Optional, Sequence, Tuple + +import torch +from torch import nn, Tensor + + +def _get_padding(kernel_size: int, stride: int) -> Tuple[int, nn.Module]: + """Returns the amount of padding needed by convolutional and max pooling layers. + + Determines the amount of padding needed to make the output size of the layer the input size divided by the stride. + The first value that the function returns is the amount of padding to be added to all sides of the input matrix + (``padding`` argument of the operation). If an uneven amount of padding is needed in different sides of the input, + the second variable that is returned is an ``nn.ZeroPad2d`` operation that adds an additional column and row of + padding. If the input size is not divisible by the stride, the output size will be rounded upwards. + + Args: + kernel_size: Size of the kernel. + stride: Stride of the operation. 
+ + Returns: + padding, pad_op: The amount of padding to be added to all sides of the input and an ``nn.Identity`` or + ``nn.ZeroPad2d`` operation to add one more column and row of padding if necessary. + """ + # The output size is generally (input_size + padding - max(kernel_size, stride)) / stride + 1 and we want to + # make it equal to input_size / stride. + padding, remainder = divmod(max(kernel_size, stride) - stride, 2) + + # If the kernel size is an even number, we need one cell of extra padding, on top of the padding added by MaxPool2d + # on both sides. + pad_op: nn.Module = nn.Identity() if remainder == 0 else nn.ZeroPad2d((0, 1, 0, 1)) + + return padding, pad_op + + +def _create_activation_module(name: Optional[str]) -> nn.Module: + """Creates a layer activation module given its type as a string. + + Args: + name: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", "linear", + or "none". + """ + if name == "relu": + return nn.ReLU(inplace=True) + if name == "leaky": + return nn.LeakyReLU(0.1, inplace=True) + if name == "mish": + return Mish() + if name == "silu" or name == "swish": + return nn.SiLU(inplace=True) + if name == "logistic": + return nn.Sigmoid() + if name == "linear" or name == "none" or name is None: + return nn.Identity() + raise ValueError(f"Activation type `{name}´ is unknown.") + + +def _create_normalization_module(name: Optional[str], num_channels: int) -> nn.Module: + """Creates a layer normalization module given its type as a string. + + Group normalization uses always 8 channels. The most common network widths are divisible by this number. + + Args: + name: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + num_channels: The number of input channels that the module expects. + """ + if name == "batchnorm": + return nn.BatchNorm2d(num_channels, eps=0.001) + if name == "groupnorm": + return nn.GroupNorm(8, num_channels, eps=0.001) + if name == "none" or name is None: + return nn.Identity() + raise ValueError(f"Normalization layer type `{name}´ is unknown.") + + +class Conv(nn.Module): + """A convolutional layer with optional layer normalization and activation. + + If ``padding`` is ``None``, the module tries to add padding so much that the output size will be the input size + divided by the stride. If the input size is not divisible by the stride, the output size will be rounded upwards. + + Args: + in_channels: Number of input channels that the layer expects. + out_channels: Number of output channels that the convolution produces. + kernel_size: Size of the convolving kernel. + stride: Stride of the convolution. + padding: Padding added to all four sides of the input. + bias: If ``True``, adds a learnable bias to the output. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
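+
+    Example (an illustrative sketch; with ``kernel_size=3`` and ``stride=2`` the padding logic above halves the
+    spatial size, rounding upwards)::
+
+        >>> import torch
+        >>> conv = Conv(3, 32, kernel_size=3, stride=2)
+        >>> conv(torch.zeros(1, 3, 64, 64)).shape
+        torch.Size([1, 32, 32, 32])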
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 1, + stride: int = 1, + padding: Optional[int] = None, + bias: bool = False, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + + if padding is None: + padding, self.pad = _get_padding(kernel_size, stride) + else: + self.pad = nn.Identity() + + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias) + self.norm = _create_normalization_module(norm, out_channels) + self.act = _create_activation_module(activation) + + def forward(self, x: Tensor) -> Tensor: + x = self.pad(x) + x = self.conv(x) + x = self.norm(x) + return self.act(x) + + +class MaxPool(nn.Module): + """A max pooling layer with padding. + + The module tries to add padding so much that the output size will be the input size divided by the stride. If the + input size is not divisible by the stride, the output size will be rounded upwards. + """ + + def __init__(self, kernel_size: int, stride: int): + super().__init__() + padding, self.pad = _get_padding(kernel_size, stride) + self.maxpool = nn.MaxPool2d(kernel_size, stride, padding) + + def forward(self, x: Tensor) -> Tensor: + x = self.pad(x) + return self.maxpool(x) + + +class RouteLayer(nn.Module): + """A routing layer concatenates the output (or part of it) from given layers. + + Args: + source_layers: Indices of the layers whose output will be concatenated. + num_chunks: Layer outputs will be split into this number of chunks. + chunk_idx: Only the chunks with this index will be concatenated. + """ + + def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> None: + super().__init__() + self.source_layers = source_layers + self.num_chunks = num_chunks + self.chunk_idx = chunk_idx + + def forward(self, outputs: List[Tensor]) -> Tensor: + chunks = [torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] for layer in self.source_layers] + return torch.cat(chunks, dim=1) + + +class ShortcutLayer(nn.Module): + """A shortcut layer adds a residual connection from the source layer. + + Args: + source_layer: Index of the layer whose output will be added to the output of the previous layer. + """ + + def __init__(self, source_layer: int) -> None: + super().__init__() + self.source_layer = source_layer + + def forward(self, outputs: List[Tensor]) -> Tensor: + return outputs[-1] + outputs[self.source_layer] + + +class Mish(nn.Module): + """Mish activation.""" + + def forward(self, x: Tensor) -> Tensor: + return x * torch.tanh(nn.functional.softplus(x)) + + +class ReOrg(nn.Module): + """Re-organizes the tensor so that every square region of four cells is placed into four different channels. + + The result is a tensor with half the width and height, and four times as many channels. + """ + + def forward(self, x: Tensor) -> Tensor: + tl = x[..., ::2, ::2] + bl = x[..., 1::2, ::2] + tr = x[..., ::2, 1::2] + br = x[..., 1::2, 1::2] + return torch.cat((tl, bl, tr, br), dim=1) + + +class BottleneckBlock(nn.Module): + """A residual block with a bottleneck layer. + + Args: + in_channels: Number of input channels that the block expects. + out_channels: Number of output channels that the block produces. + hidden_channels: Number of output channels the (hidden) bottleneck layer produces. By default the number of + output channels of the block. + shortcut: Whether the block should include a shortcut connection. + activation: Which layer activation to use. 
Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + hidden_channels: Optional[int] = None, + shortcut: bool = True, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + if hidden_channels is None: + hidden_channels = out_channels + + self.convs = nn.Sequential( + Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm), + Conv(hidden_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm), + ) + self.shortcut = shortcut and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + y = self.convs(x) + return x + y if self.shortcut else y + + +class TinyStage(nn.Module): + """One stage of the "tiny" network architecture from YOLOv4. + + Args: + num_channels: Number of channels in the input of the stage. Partial output will have as many channels and full + output will have twice as many channels. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + num_channels: int, + activation: Optional[str] = "leaky", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + hidden_channels = num_channels // 2 + self.conv1 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + self.conv2 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + self.mix = Conv(num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: + partial = torch.chunk(x, 2, dim=1)[1] + y1 = self.conv1(partial) + y2 = self.conv2(y1) + partial_output = self.mix(torch.cat((y2, y1), dim=1)) + full_output = torch.cat((x, partial_output), dim=1) + return partial_output, full_output + + +class CSPStage(nn.Module): + """One stage of a Cross Stage Partial Network (CSPNet). + + Encapsulates a number of bottleneck blocks in the "fusion first" CSP structure. + + `Chien-Yao Wang et al. `_ + + Args: + in_channels: Number of input channels that the CSP stage expects. + out_channels: Number of output channels that the CSP stage produces. + depth: Number of bottleneck blocks that the CSP stage contains. + shortcut: Whether the bottleneck blocks should include a shortcut connection. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + depth: int = 1, + shortcut: bool = True, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two + # convolutions with N/2 output channels. 
+ hidden_channels = out_channels // 2 + + self.split1 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split2 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + bottlenecks: List[nn.Module] = [ + BottleneckBlock(hidden_channels, hidden_channels, shortcut=shortcut, norm=norm, activation=activation) + for _ in range(depth) + ] + self.bottlenecks = nn.Sequential(*bottlenecks) + self.mix = Conv(hidden_channels * 2, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x: Tensor) -> Tensor: + y1 = self.bottlenecks(self.split1(x)) + y2 = self.split2(x) + return self.mix(torch.cat((y1, y2), dim=1)) + + +class ELANStage(nn.Module): + """One stage of an Efficient Layer Aggregation Network (ELAN). + + `Chien-Yao Wang et al. `_ + + Args: + in_channels: Number of input channels that the ELAN stage expects. + out_channels: Number of output channels that the ELAN stage produces. + hidden_channels: Number of output channels that the computational blocks produce. The default value is half the + number of output channels of the block, as in YOLOv7-W6, but the value varies between the variants. + split_channels: Number of channels in each part after splitting the input to the cross stage connection and the + computational blocks. The default value is the number of hidden channels, as in all YOLOv7 backbones. Most + YOLOv7 heads use twice the number of hidden channels. + depth: Number of computational blocks that the ELAN stage contains. The default value is 2. YOLOv7 backbones use + 2 to 4 blocks per stage. + block_depth: Number of convolutional layers in one computational block. The default value is 2. YOLOv7 backbones + have two convolutions per block. YOLOv7 heads (except YOLOv7-X) have 2 to 8 blocks with only one convolution + in each. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + hidden_channels: Optional[int] = None, + split_channels: Optional[int] = None, + depth: int = 2, + block_depth: int = 2, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + + def block(in_channels: int, out_channels: int) -> nn.Module: + convs = [conv3x3(in_channels, out_channels)] + for _ in range(block_depth - 1): + convs.append(conv3x3(out_channels, out_channels)) + return nn.Sequential(*convs) + + # Instead of splitting the N output channels of a convolution into two parts, we can equivalently perform two + # convolutions with N/2 output channels. However, in many YOLOv7 architectures, the number of hidden channels is + # not exactly half the number of output channels. 
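+        # With the defaults below (split_channels == hidden_channels and depth == 2), the concatenation in forward()
+        # sees 2 + 2 = 4 times hidden_channels channels before the final mixing convolution.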
+ if hidden_channels is None: + hidden_channels = out_channels // 2 + + if split_channels is None: + split_channels = hidden_channels + + self.split1 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split2 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + blocks = [block(split_channels, hidden_channels)] + for _ in range(depth - 1): + blocks.append(block(hidden_channels, hidden_channels)) + self.blocks = nn.ModuleList(blocks) + + total_channels = (split_channels * 2) + (hidden_channels * depth) + self.mix = Conv(total_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x: Tensor) -> Tensor: + outputs = [self.split1(x), self.split2(x)] + x = outputs[-1] + for block in self.blocks: + x = block(x) + outputs.append(x) + return self.mix(torch.cat(outputs, dim=1)) + + +class CSPSPP(nn.Module): + """Spatial pyramid pooling module from the Cross Stage Partial Network from YOLOv4. + + Args: + in_channels: Number of input channels that the module expects. + out_channels: Number of output channels that the module produces. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + + def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=kernel_size, stride=1, activation=activation, norm=norm) + + self.conv1 = nn.Sequential( + conv(in_channels, out_channels), + conv(out_channels, out_channels, kernel_size=3), + conv(out_channels, out_channels), + ) + self.conv2 = conv(in_channels, out_channels) + + self.maxpool1 = MaxPool(kernel_size=5, stride=1) + self.maxpool2 = MaxPool(kernel_size=9, stride=1) + self.maxpool3 = MaxPool(kernel_size=13, stride=1) + + self.mix1 = nn.Sequential( + conv(4 * out_channels, out_channels), + conv(out_channels, out_channels, kernel_size=3), + ) + self.mix2 = Conv(2 * out_channels, out_channels) + + def forward(self, x: Tensor) -> Tensor: + x1 = self.conv1(x) + x2 = self.maxpool1(x1) + x3 = self.maxpool2(x1) + x4 = self.maxpool3(x1) + y1 = self.mix1(torch.cat((x1, x2, x3, x4), dim=1)) + y2 = self.conv2(x) + return self.mix2(torch.cat((y1, y2), dim=1)) + + +class FastSPP(nn.Module): + """Fast spatial pyramid pooling module from YOLOv5. + + Args: + in_channels: Number of input channels that the module expects. + out_channels: Number of output channels that the module produces. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + norm: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
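+
+    Example (an illustrative sketch; the chained 5x5 max pools use stride 1, so the spatial size is preserved)::
+
+        >>> import torch
+        >>> spp = FastSPP(512, 512)
+        >>> spp(torch.zeros(1, 512, 16, 16)).shape
+        torch.Size([1, 512, 16, 16])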
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + activation: Optional[str] = "silu", + norm: Optional[str] = "batchnorm", + ): + super().__init__() + hidden_channels = in_channels // 2 + self.conv = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.maxpool = MaxPool(kernel_size=5, stride=1) + self.mix = Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + + def forward(self, x: Tensor) -> Tensor: + y1 = self.conv(x) + y2 = self.maxpool(y1) + y3 = self.maxpool(y2) + y4 = self.maxpool(y3) + return self.mix(torch.cat((y1, y2, y3, y4), dim=1)) + + +class YOLOV4TinyBackbone(nn.Module): + """Backbone of the "tiny" network architecture from YOLOv4. + + Args: + in_channels: Number of channels in the input image. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int = 3, + width: int = 32, + activation: Optional[str] = "leaky", + normalization: Optional[str] = "batchnorm", + ): + super().__init__() + + def smooth(num_channels: int) -> nn.Module: + return Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + conv_module = Conv( + in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization + ) + return nn.Sequential(OrderedDict([("downsample", conv_module), ("smooth", smooth(out_channels))])) + + def maxpool(out_channels: int) -> nn.Module: + return nn.Sequential( + OrderedDict( + [ + ("pad", nn.ZeroPad2d((0, 1, 0, 1))), + ("maxpool", MaxPool(kernel_size=2, stride=2)), + ("smooth", smooth(out_channels)), + ] + ) + ) + + def stage(out_channels: int, use_maxpool: bool) -> nn.Module: + if use_maxpool: + downsample_module = maxpool(out_channels) + else: + downsample_module = downsample(out_channels // 2, out_channels) + stage_module = TinyStage(out_channels, activation=activation, norm=normalization) + return nn.Sequential(OrderedDict([("downsample", downsample_module), ("stage", stage_module)])) + + stages = [ + Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization), + stage(width * 2, False), + stage(width * 4, True), + stage(width * 8, True), + maxpool(width * 16), + ] + self.stages = nn.ModuleList(stages) + + def forward(self, x: Tensor) -> List[Tensor]: + c1 = self.stages[0](x) + c2, x = self.stages[1](c1) + c3, x = self.stages[2](x) + c4, x = self.stages[3](x) + c5 = self.stages[4](x) + return [c1, c2, c3, c4, c5] + + +class YOLOV4Backbone(nn.Module): + """A backbone that corresponds approximately to the Cross Stage Partial Network from YOLOv4. + + Args: + in_channels: Number of channels in the input image. + widths: Number of channels at each network stage. Typically ``(32, 64, 128, 256, 512, 1024)``. The P6 variant + adds one more stage with 1024 channels. + depths: Number of bottleneck layers at each network stage. Typically ``(1, 1, 2, 8, 8, 4)``. The P6 variant uses + ``(1, 1, 3, 15, 15, 7, 7)``. + activation: Which layer activation to use. 
Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". + """ + + def __init__( + self, + in_channels: int = 3, + widths: Sequence[int] = (32, 64, 128, 256, 512, 1024), + depths: Sequence[int] = (1, 1, 2, 8, 8, 4), + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + if len(widths) != len(depths): + raise ValueError("Width and depth has to be given for an equal number of stages.") + + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: + csp = CSPStage( + out_channels, + out_channels, + depth=depth, + shortcut=True, + activation=activation, + norm=normalization, + ) + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("csp", csp), + ] + ) + ) + + convs = [conv3x3(in_channels, widths[0])] + [conv3x3(widths[0], widths[0]) for _ in range(depths[0] - 1)] + self.stem = nn.Sequential(*convs) + self.stages = nn.ModuleList( + stage(in_channels, out_channels, depth) + for in_channels, out_channels, depth in zip(widths[:-1], widths[1:], depths[1:]) + ) + + def forward(self, x: Tensor) -> List[Tensor]: + x = self.stem(x) + outputs: List[Tensor] = [] + for stage in self.stages: + x = stage(x) + outputs.append(x) + return outputs + + +class YOLOV5Backbone(nn.Module): + """The Cross Stage Partial Network backbone from YOLOv5. + + Args: + in_channels: Number of channels in the input image. + width: Number of channels in the narrowest convolutional layer. The wider convolutional layers will use a number + of channels that is a multiple of this value. The values used by the different variants are 16 (yolov5n), 32 + (yolov5s), 48 (yolov5m), 64 (yolov5l), and 80 (yolov5x). + depth: Repeat the bottleneck layers this many times. Can be used to make the network deeper. The values used by + the different variants are 1 (yolov5n, yolov5s), 2 (yolov5m), 3 (yolov5l), and 4 (yolov5x). + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
+ """ + + def __init__( + self, + in_channels: int = 3, + width: int = 64, + depth: int = 3, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def downsample(in_channels: int, out_channels: int, kernel_size: int = 3) -> nn.Module: + return Conv( + in_channels, out_channels, kernel_size=kernel_size, stride=2, activation=activation, norm=normalization + ) + + def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: + csp = CSPStage( + out_channels, + out_channels, + depth=depth, + shortcut=True, + activation=activation, + norm=normalization, + ) + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("csp", csp), + ] + ) + ) + + stages = [ + downsample(in_channels, width, kernel_size=6), + stage(width, width * 2, depth), + stage(width * 2, width * 4, depth * 2), + stage(width * 4, width * 8, depth * 3), + stage(width * 8, width * 16, depth), + ] + self.stages = nn.ModuleList(stages) + + def forward(self, x: Tensor) -> List[Tensor]: + c1 = self.stages[0](x) + c2 = self.stages[1](c1) + c3 = self.stages[2](c2) + c4 = self.stages[3](c3) + c5 = self.stages[4](c4) + return [c1, c2, c3, c4, c5] + + +class YOLOV7Backbone(nn.Module): + """A backbone that corresponds to the W6 variant of the Efficient Layer Aggregation Network from YOLOv7. + + Args: + in_channels: Number of channels in the input image. + widths: Number of channels at each network stage. Before the first stage there will be one extra split of + spatial resolution by a ``ReOrg`` layer, producing ``in_channels * 4`` channels. + depth: Number of computational blocks at each network stage. YOLOv7-W6 backbone uses 2. + block_depth: Number of convolutional layers in one computational block. YOLOv7-W6 backbone uses 2. + activation: Which layer activation to use. Can be "relu", "leaky", "mish", "silu" (or "swish"), "logistic", + "linear", or "none". + normalization: Which layer normalization to use. Can be "batchnorm", "groupnorm", or "none". 
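+
+    Example (an illustrative sketch; the ``ReOrg`` stem halves the resolution before the five downsampling stages)::
+
+        >>> import torch
+        >>> backbone = YOLOV7Backbone()
+        >>> feature_maps = backbone(torch.zeros(1, 3, 256, 256))
+        >>> feature_maps[0].shape, feature_maps[-1].shape
+        (torch.Size([1, 128, 64, 64]), torch.Size([1, 1024, 4, 4]))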
+ """ + + def __init__( + self, + in_channels: int = 3, + widths: Sequence[int] = (64, 128, 256, 512, 768, 1024), + depth: int = 2, + block_depth: int = 2, + activation: Optional[str] = "silu", + normalization: Optional[str] = "batchnorm", + ) -> None: + super().__init__() + + def conv3x3(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + + def downsample(in_channels: int, out_channels: int) -> nn.Module: + return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + + def stage(in_channels: int, out_channels: int) -> nn.Module: + elan = ELANStage( + out_channels, + out_channels, + depth=depth, + block_depth=block_depth, + activation=activation, + norm=normalization, + ) + return nn.Sequential( + OrderedDict( + [ + ("downsample", downsample(in_channels, out_channels)), + ("elan", elan), + ] + ) + ) + + self.stem = nn.Sequential(*[ReOrg(), conv3x3(in_channels * 4, widths[0])]) + self.stages = nn.ModuleList( + stage(in_channels, out_channels) for in_channels, out_channels in zip(widths[:-1], widths[1:]) + ) + + def forward(self, x: Tensor) -> List[Tensor]: + x = self.stem(x) + outputs: List[Tensor] = [] + for stage in self.stages: + x = stage(x) + outputs.append(x) + return outputs From f5235e1dd9f46eaa4d8be4746d8b1b37a8d3064a Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 08:53:18 -0700 Subject: [PATCH 16/23] Fix imports --- mart/models/detection/__init__.py | 7 - mart/models/detection/anchor_utils.py | 269 +---------------------- mart/models/detection/box_utils.py | 2 +- mart/models/detection/target_matching.py | 2 +- mart/models/detection/yolo.py | 12 +- mart/models/detection/yolo_networks.py | 2 +- 6 files changed, 10 insertions(+), 284 deletions(-) diff --git a/mart/models/detection/__init__.py b/mart/models/detection/__init__.py index 4bf1a515..7395b681 100644 --- a/mart/models/detection/__init__.py +++ b/mart/models/detection/__init__.py @@ -1,11 +1,4 @@ # https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/__init__.py -from .faster_rcnn import * -from .fcos import * -from .keypoint_rcnn import * -from .mask_rcnn import * -from .retinanet import * -from .ssd import * -from .ssdlite import * from .yolo import YOLO, yolo_darknet, yolov4, YOLOV4_Backbone_Weights, YOLOV4_Weights from .yolo_networks import ( DarknetNetwork, diff --git a/mart/models/detection/anchor_utils.py b/mart/models/detection/anchor_utils.py index 943071b0..8e89becd 100644 --- a/mart/models/detection/anchor_utils.py +++ b/mart/models/detection/anchor_utils.py @@ -1,273 +1,6 @@ # https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/anchor_utils.py - -import math -from typing import List, Optional - import torch -from torch import nn, Tensor - -from .image_list import ImageList - - -class AnchorGenerator(nn.Module): - """ - Module that generates anchors for a set of feature maps and - image sizes. - - The module support computing anchors at multiple sizes and aspect ratios - per feature map. This module assumes aspect ratio = height / width for - each anchor. - - sizes and aspect_ratios should have the same number of elements, and it should - correspond to the number of feature maps. 
- - sizes[i] and aspect_ratios[i] can have an arbitrary number of elements, - and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors - per spatial location for feature map i. - - Args: - sizes (Tuple[Tuple[int]]): - aspect_ratios (Tuple[Tuple[float]]): - """ - - __annotations__ = { - "cell_anchors": List[torch.Tensor], - } - - def __init__( - self, - sizes=((128, 256, 512),), - aspect_ratios=((0.5, 1.0, 2.0),), - ): - super().__init__() - - if not isinstance(sizes[0], (list, tuple)): - # TODO change this - sizes = tuple((s,) for s in sizes) - if not isinstance(aspect_ratios[0], (list, tuple)): - aspect_ratios = (aspect_ratios,) * len(sizes) - - self.sizes = sizes - self.aspect_ratios = aspect_ratios - self.cell_anchors = [ - self.generate_anchors(size, aspect_ratio) for size, aspect_ratio in zip(sizes, aspect_ratios) - ] - - # TODO: https://github.com/pytorch/pytorch/issues/26792 - # For every (aspect_ratios, scales) combination, output a zero-centered anchor with those values. - # (scales, aspect_ratios) are usually an element of zip(self.scales, self.aspect_ratios) - # This method assumes aspect ratio = height / width for an anchor. - def generate_anchors( - self, - scales: List[int], - aspect_ratios: List[float], - dtype: torch.dtype = torch.float32, - device: torch.device = torch.device("cpu"), - ) -> Tensor: - scales = torch.as_tensor(scales, dtype=dtype, device=device) - aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device) - h_ratios = torch.sqrt(aspect_ratios) - w_ratios = 1 / h_ratios - - ws = (w_ratios[:, None] * scales[None, :]).view(-1) - hs = (h_ratios[:, None] * scales[None, :]).view(-1) - - base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2 - return base_anchors.round() - - def set_cell_anchors(self, dtype: torch.dtype, device: torch.device): - self.cell_anchors = [cell_anchor.to(dtype=dtype, device=device) for cell_anchor in self.cell_anchors] - - def num_anchors_per_location(self) -> List[int]: - return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)] - - # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2), - # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a. - def grid_anchors(self, grid_sizes: List[List[int]], strides: List[List[Tensor]]) -> List[Tensor]: - anchors = [] - cell_anchors = self.cell_anchors - torch._assert(cell_anchors is not None, "cell_anchors should not be None") - torch._assert( - len(grid_sizes) == len(strides) == len(cell_anchors), - "Anchors should be Tuple[Tuple[int]] because each feature " - "map could potentially have different sizes and aspect ratios. 
" - "There needs to be a match between the number of " - "feature maps passed and the number of sizes / aspect ratios specified.", - ) - - for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors): - grid_height, grid_width = size - stride_height, stride_width = stride - device = base_anchors.device - - # For output anchor, compute [x_center, y_center, x_center, y_center] - shifts_x = torch.arange(0, grid_width, dtype=torch.int32, device=device) * stride_width - shifts_y = torch.arange(0, grid_height, dtype=torch.int32, device=device) * stride_height - shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij") - shift_x = shift_x.reshape(-1) - shift_y = shift_y.reshape(-1) - shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) - - # For every (base anchor, output anchor) pair, - # offset each zero-centered base anchor by the center of the output anchor. - anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)) - - return anchors - - def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Tensor]: - grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] - image_size = image_list.tensors.shape[-2:] - dtype, device = feature_maps[0].dtype, feature_maps[0].device - strides = [ - [ - torch.empty((), dtype=torch.int64, device=device).fill_(image_size[0] // g[0]), - torch.empty((), dtype=torch.int64, device=device).fill_(image_size[1] // g[1]), - ] - for g in grid_sizes - ] - self.set_cell_anchors(dtype, device) - anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides) - anchors: List[List[torch.Tensor]] = [] - for _ in range(len(image_list.image_sizes)): - anchors_in_image = [anchors_per_feature_map for anchors_per_feature_map in anchors_over_all_feature_maps] - anchors.append(anchors_in_image) - anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors] - return anchors - - -class DefaultBoxGenerator(nn.Module): - """ - This module generates the default boxes of SSD for a set of feature maps and image sizes. - - Args: - aspect_ratios (List[List[int]]): A list with all the aspect ratios used in each feature map. - min_ratio (float): The minimum scale :math:`\text{s}_{\text{min}}` of the default boxes used in the estimation - of the scales of each feature map. It is used only if the ``scales`` parameter is not provided. - max_ratio (float): The maximum scale :math:`\text{s}_{\text{max}}` of the default boxes used in the estimation - of the scales of each feature map. It is used only if the ``scales`` parameter is not provided. - scales (List[float]], optional): The scales of the default boxes. If not provided it will be estimated using - the ``min_ratio`` and ``max_ratio`` parameters. - steps (List[int]], optional): It's a hyper-parameter that affects the tiling of default boxes. If not provided - it will be estimated from the data. - clip (bool): Whether the standardized values of default boxes should be clipped between 0 and 1. The clipping - is applied while the boxes are encoded in format ``(cx, cy, w, h)``. 
- """ - - def __init__( - self, - aspect_ratios: List[List[int]], - min_ratio: float = 0.15, - max_ratio: float = 0.9, - scales: Optional[List[float]] = None, - steps: Optional[List[int]] = None, - clip: bool = True, - ): - super().__init__() - if steps is not None and len(aspect_ratios) != len(steps): - raise ValueError("aspect_ratios and steps should have the same length") - self.aspect_ratios = aspect_ratios - self.steps = steps - self.clip = clip - num_outputs = len(aspect_ratios) - - # Estimation of default boxes scales - if scales is None: - if num_outputs > 1: - range_ratio = max_ratio - min_ratio - self.scales = [min_ratio + range_ratio * k / (num_outputs - 1.0) for k in range(num_outputs)] - self.scales.append(1.0) - else: - self.scales = [min_ratio, max_ratio] - else: - self.scales = scales - - self._wh_pairs = self._generate_wh_pairs(num_outputs) - - def _generate_wh_pairs( - self, num_outputs: int, dtype: torch.dtype = torch.float32, device: torch.device = torch.device("cpu") - ) -> List[Tensor]: - _wh_pairs: List[Tensor] = [] - for k in range(num_outputs): - # Adding the 2 default width-height pairs for aspect ratio 1 and scale s'k - s_k = self.scales[k] - s_prime_k = math.sqrt(self.scales[k] * self.scales[k + 1]) - wh_pairs = [[s_k, s_k], [s_prime_k, s_prime_k]] - - # Adding 2 pairs for each aspect ratio of the feature map k - for ar in self.aspect_ratios[k]: - sq_ar = math.sqrt(ar) - w = self.scales[k] * sq_ar - h = self.scales[k] / sq_ar - wh_pairs.extend([[w, h], [h, w]]) - - _wh_pairs.append(torch.as_tensor(wh_pairs, dtype=dtype, device=device)) - return _wh_pairs - - def num_anchors_per_location(self) -> List[int]: - # Estimate num of anchors based on aspect ratios: 2 default boxes + 2 * ratios of feaure map. - return [2 + 2 * len(r) for r in self.aspect_ratios] - - # Default Boxes calculation based on page 6 of SSD paper - def _grid_default_boxes( - self, grid_sizes: List[List[int]], image_size: List[int], dtype: torch.dtype = torch.float32 - ) -> Tensor: - default_boxes = [] - for k, f_k in enumerate(grid_sizes): - # Now add the default boxes for each width-height pair - if self.steps is not None: - x_f_k = image_size[1] / self.steps[k] - y_f_k = image_size[0] / self.steps[k] - else: - y_f_k, x_f_k = f_k - - shifts_x = ((torch.arange(0, f_k[1]) + 0.5) / x_f_k).to(dtype=dtype) - shifts_y = ((torch.arange(0, f_k[0]) + 0.5) / y_f_k).to(dtype=dtype) - shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij") - shift_x = shift_x.reshape(-1) - shift_y = shift_y.reshape(-1) - - shifts = torch.stack((shift_x, shift_y) * len(self._wh_pairs[k]), dim=-1).reshape(-1, 2) - # Clipping the default boxes while the boxes are encoded in format (cx, cy, w, h) - _wh_pair = self._wh_pairs[k].clamp(min=0, max=1) if self.clip else self._wh_pairs[k] - wh_pairs = _wh_pair.repeat((f_k[0] * f_k[1]), 1) - - default_box = torch.cat((shifts, wh_pairs), dim=1) - - default_boxes.append(default_box) - - return torch.cat(default_boxes, dim=0) - - def __repr__(self) -> str: - s = ( - f"{self.__class__.__name__}(" - f"aspect_ratios={self.aspect_ratios}" - f", clip={self.clip}" - f", scales={self.scales}" - f", steps={self.steps}" - ")" - ) - return s - - def forward(self, image_list: ImageList, feature_maps: List[Tensor]) -> List[Tensor]: - grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps] - image_size = image_list.tensors.shape[-2:] - dtype, device = feature_maps[0].dtype, feature_maps[0].device - default_boxes = self._grid_default_boxes(grid_sizes, image_size, 
dtype=dtype) - default_boxes = default_boxes.to(device) - - dboxes = [] - x_y_size = torch.tensor([image_size[1], image_size[0]], device=default_boxes.device) - for _ in image_list.image_sizes: - dboxes_in_image = default_boxes - dboxes_in_image = torch.cat( - [ - (dboxes_in_image[:, :2] - 0.5 * dboxes_in_image[:, 2:]) * x_y_size, - (dboxes_in_image[:, :2] + 0.5 * dboxes_in_image[:, 2:]) * x_y_size, - ], - -1, - ) - dboxes.append(dboxes_in_image) - return dboxes +from torch import Tensor def grid_offsets(grid_size: Tensor) -> Tensor: diff --git a/mart/models/detection/box_utils.py b/mart/models/detection/box_utils.py index 0813961b..d011374c 100644 --- a/mart/models/detection/box_utils.py +++ b/mart/models/detection/box_utils.py @@ -2,7 +2,7 @@ import torch from torch import Tensor -from ...ops import box_iou +from torchvision.ops import box_iou def aligned_iou(wh1: Tensor, wh2: Tensor) -> Tensor: diff --git a/mart/models/detection/target_matching.py b/mart/models/detection/target_matching.py index 7e71457c..7f1d1c88 100644 --- a/mart/models/detection/target_matching.py +++ b/mart/models/detection/target_matching.py @@ -4,7 +4,7 @@ import torch from torch import Tensor -from ...ops import box_convert +from torchvision.ops import box_convert from .anchor_utils import grid_centers from .box_utils import aligned_iou, box_size_ratio, iou_below, is_inside_box from .yolo_loss import YOLOLoss diff --git a/mart/models/detection/yolo.py b/mart/models/detection/yolo.py index d8f23732..b338f116 100644 --- a/mart/models/detection/yolo.py +++ b/mart/models/detection/yolo.py @@ -6,12 +6,12 @@ import torch.nn as nn from torch import Tensor -from ...ops import batched_nms -from ...transforms import functional as F -from .._api import register_model, Weights, WeightsEnum -from .._utils import _ovewrite_value_param -from ..yolo import YOLOV4Backbone -from .backbone_utils import _validate_trainable_layers +from torchvision.ops import batched_nms +from torchvision.transforms import functional as F +from torchvision.models._api import register_model, Weights, WeightsEnum +from torchvision.models._utils import _ovewrite_value_param +from mart.models.yolo import YOLOV4Backbone +from torchvision.models.detection.backbone_utils import _validate_trainable_layers from .yolo_networks import DarknetNetwork, PRED, TARGET, TARGETS, YOLOV4Network IMAGES = List[Tensor] # TorchScript doesn't allow a tuple. diff --git a/mart/models/detection/yolo_networks.py b/mart/models/detection/yolo_networks.py index 3ada7ae1..26224ca8 100644 --- a/mart/models/detection/yolo_networks.py +++ b/mart/models/detection/yolo_networks.py @@ -10,7 +10,7 @@ import torch.nn as nn from torch import Tensor -from ...ops import box_convert +from torchvision.ops import box_convert from ..yolo import ( Conv, CSPSPP, From 0939b8387622201c8fe87dba8943e5ba58f3aaba Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 11:35:00 -0700 Subject: [PATCH 17/23] Add support for calling different functions using dot-syntax in sequences --- mart/nn/nn.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mart/nn/nn.py b/mart/nn/nn.py index 48704315..b3661952 100644 --- a/mart/nn/nn.py +++ b/mart/nn/nn.py @@ -80,6 +80,12 @@ def parse_sequence(self, sequence): # We can omit the key of _call_with_args_ if it is the only config. module_cfg = {"_call_with_args_": module_cfg} + # Add support for calling different functions using dot-syntax + if "." 
not in module_name: + module_name = f"{module_name}.__call__" + module_name, _call_ = module_name.split(".", 1) + module_cfg["_call_"] = _call_ + # The return name could be different from module_name when a module is used more than once. return_name = module_cfg.pop("_name_", module_name) module = CallWith(self[module_name], **module_cfg) @@ -124,7 +130,8 @@ def __call__(self, **kwargs): class CallWith: def __init__( self, - module: Callable, + module: object, + _call_: str | None = "__call__", _call_with_args_: Iterable[str] | None = None, _return_as_dict_: Iterable[str] | None = None, _train_mode_: bool | None = None, @@ -134,6 +141,7 @@ def __init__( super().__init__() self.module = module + self.call_attr = _call_ self.arg_keys = _call_with_args_ self.kwarg_keys = kwarg_keys self.return_keys = _return_as_dict_ @@ -197,7 +205,8 @@ def __call__( with context: # FIXME: Add better error message - ret = self.module(*args, **kwargs) + func = getattr(self.module, self.call_attr) + ret = func(*args, **kwargs) if isinstance(self.module, torch.nn.Module): if _train_mode_ is not None: From 363355eab3ca339718b31b90554739558a839702 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 11:52:56 -0700 Subject: [PATCH 18/23] Add trainable YOLO v3/v4 experiments --- mart/configs/datamodule/coco_yolo.yaml | 24 +++ mart/configs/experiment/COCO_YOLOv3.yaml | 35 +++ mart/configs/experiment/COCO_YOLOv4.yaml | 35 +++ mart/configs/model/yolo.yaml | 71 +++++++ mart/datamodules/coco.py | 13 ++ mart/transforms/extended.py | 260 ++++++++++++++++++++++- 6 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 mart/configs/datamodule/coco_yolo.yaml create mode 100644 mart/configs/experiment/COCO_YOLOv3.yaml create mode 100644 mart/configs/experiment/COCO_YOLOv4.yaml create mode 100644 mart/configs/model/yolo.yaml diff --git a/mart/configs/datamodule/coco_yolo.yaml b/mart/configs/datamodule/coco_yolo.yaml new file mode 100644 index 00000000..fc4c13cc --- /dev/null +++ b/mart/configs/datamodule/coco_yolo.yaml @@ -0,0 +1,24 @@ +defaults: + - coco + +train_dataset: + transforms: + transforms: + - _target_: torchvision.transforms.ToTensor + - _target_: mart.transforms.ConvertCocoPolysToMask + - _target_: mart.transforms.PadToSquare + fill: 0.5 + - _target_: mart.transforms.Resize + size: [416, 416] + - _target_: mart.transforms.RemapLabels + - _target_: mart.transforms.ConvertInstanceSegmentationToPerturbable + +val_dataset: + transforms: ${..train_dataset.transforms} + +test_dataset: + transforms: ${..val_dataset.transforms} + +collate_fn: + _target_: hydra.utils.get_method + path: mart.datamodules.coco.yolo_collate_fn diff --git a/mart/configs/experiment/COCO_YOLOv3.yaml b/mart/configs/experiment/COCO_YOLOv3.yaml new file mode 100644 index 00000000..24bd96f5 --- /dev/null +++ b/mart/configs/experiment/COCO_YOLOv3.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +defaults: + - override /datamodule: coco_yolo + - override /model: yolo + - override /optimization: super_convergence + - override /metric: average_precision + +task_name: "COCO_YOLOv3" +tags: ["evaluation"] + +optimized_metric: "test_metrics/map" + +trainer: + # 117,266 training images, 6 epochs, batch_size=16, 43,974.75 + max_steps: 43975 + # FIXME: "nms_kernel" not implemented for 'BFloat16', torch.ops.torchvision.nms(). 
+ precision: 32 + +datamodule: + num_workers: 4 + ims_per_batch: 8 + +model: + modules: + yolo: + config_path: ${paths.data_dir}/yolov3.cfg + weights_path: ${paths.data_dir}/yolov3.weights + + optimizer: + lr: 0.001 + momentum: 0.9 + weight_decay: 0.0005 + + training_metrics: null diff --git a/mart/configs/experiment/COCO_YOLOv4.yaml b/mart/configs/experiment/COCO_YOLOv4.yaml new file mode 100644 index 00000000..7a215ab0 --- /dev/null +++ b/mart/configs/experiment/COCO_YOLOv4.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +defaults: + - override /datamodule: coco_yolo + - override /model: yolo + - override /optimization: super_convergence + - override /metric: average_precision + +task_name: "COCO_YOLOv4" +tags: ["evaluation"] + +optimized_metric: "test_metrics/map" + +trainer: + # 117,266 training images, 6 epochs, batch_size=16, 43,974.75 + max_steps: 43975 + # FIXME: "nms_kernel" not implemented for 'BFloat16', torch.ops.torchvision.nms(). + precision: 32 + +datamodule: + num_workers: 4 + ims_per_batch: 8 + +model: + modules: + yolo: + config_path: ${paths.data_dir}/yolov4.cfg + weights_path: ${paths.data_dir}/yolov4.weights + + optimizer: + lr: 0.001 + momentum: 0.9 + weight_decay: 0.0005 + + training_metrics: null diff --git a/mart/configs/model/yolo.yaml b/mart/configs/model/yolo.yaml new file mode 100644 index 00000000..f21cc6f8 --- /dev/null +++ b/mart/configs/model/yolo.yaml @@ -0,0 +1,71 @@ +defaults: + - modular + +modules: + yolo: + _target_: mart.models.detection.yolo.yolo_darknet + config_path: ??? + weights_path: null + + loss: + _target_: mart.nn.Sum + + output: + _target_: mart.nn.ReturnKwargs + +# training metrics are too expensive +training_metrics: null + +training_sequence: + seq010: + yolo: + images: "input" + _train_mode_: False + _inference_mode_: True + + seq020: + yolo.process_detections: + _name_: "preds" + preds: "yolo" + + seq030: + yolo: + _name_: "losses" + images: "input" + targets: "target" + + seq100: + loss: + _call_with_args_: + - "losses.overlap" + - "losses.confidence" + - "losses.classification" + +validation_sequence: + seq010: ${..training_sequence.seq010} + seq020: ${..training_sequence.seq020} + seq030: + yolo: + _name_: "losses" + images: "input" + targets: "target" + _train_mode_: False + seq100: ${..training_sequence.seq100} + +test_sequence: + seq010: ${..validation_sequence.seq010} + seq020: ${..validation_sequence.seq020} + +training_step_log: + loss: "loss" + loss_overlap: "losses.overlap" + loss_confidence: "losses.confidence" + loss_classification: "losses.classification" + +validation_step_log: + loss: "loss" + loss_overlap: "losses.overlap" + loss_confidence: "losses.confidence" + loss_classification: "losses.classification" + +test_step_log: null diff --git a/mart/datamodules/coco.py b/mart/datamodules/coco.py index 42ddcebb..43b2e45b 100644 --- a/mart/datamodules/coco.py +++ b/mart/datamodules/coco.py @@ -7,6 +7,8 @@ import os from typing import Any, Callable, List, Optional +import torch +from torch.utils.data import default_collate import numpy as np from torchvision.datasets.coco import CocoDetection as CocoDetection_ from torchvision.datasets.folder import default_loader @@ -44,6 +46,10 @@ def __init__( self.modalities = modalities + # Targets can contain a lot of information... 
+ # https://discuss.pytorch.org/t/runtimeerror-received-0-items-of-ancdata/4999/4 + torch.multiprocessing.set_sharing_strategy("file_system") + def _load_image(self, id: int) -> Any: if self.modalities is None: return super()._load_image(id) @@ -89,3 +95,10 @@ def __getitem__(self, index: int): # Source: https://github.com/pytorch/vision/blob/dc07ac2add8285e16a716564867d0b4b953f6735/references/detection/utils.py#L203 def collate_fn(batch): return tuple(zip(*batch)) + +def yolo_collate_fn(batch): + images, targets = collate_fn(batch) + + images = default_collate(images) + + return images, targets diff --git a/mart/transforms/extended.py b/mart/transforms/extended.py index 13cd0e74..1bc98933 100644 --- a/mart/transforms/extended.py +++ b/mart/transforms/extended.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # +from __future__ import annotations + import logging import os from typing import Dict, Optional, Tuple @@ -29,6 +31,10 @@ "ConvertInstanceSegmentationToPerturbable", "RandomHorizontalFlip", "ConvertCocoPolysToMask", + "PadToSquare", + "Resize", + "RemapLabels", + "CreatePerturbableMaskFromImage", ] @@ -115,7 +121,7 @@ class ConvertInstanceSegmentationToPerturbable(ExTransform): """Merge all instance masks and reverse.""" def __call__(self, image, target): - perturbable_mask = torch.sum(target["masks"], dim=0) == 0 + perturbable_mask = torch.sum(target["masks"], dim=0, keepdim=True) == 0 # Convert to float to be differentiable. target["perturbable_mask"] = perturbable_mask.float() @@ -173,8 +179,8 @@ def flip_perturbable_mask(image, target): return image, target def forward( - self, image: Tensor, target: Optional[Dict[str, Tensor]] = None - ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + self, image: Tensor, target: dict[str, Tensor] | None = None + ) -> tuple[Tensor, dict[str, Tensor] | None]: if torch.rand(1) < self.p: image = F.hflip(image) if target is not None: @@ -190,3 +196,251 @@ def forward( class ConvertCocoPolysToMask(ConvertCocoPolysToMask_, ExTransform): pass + + +class PadToSquare(ExTransform): + def __init__(self, fill): + self.fill = fill + + def __call__( + self, + image: Tensor, # CHW + target: dict[str, Tensor] | None = None, + ): + w, h = F.get_image_size(image) + + l_or_t = abs(h - w) // 2 + r_or_b = abs(h - w) - l_or_t + + # padding is (left, top, right, bottom) + if h > w: + padding = (l_or_t, 0, r_or_b, 0) + else: + padding = (0, l_or_t, 0, r_or_b) + + image = F.pad(image, padding, fill=self.fill) + + if target is not None: + if "boxes" in target: + target["boxes"] = self.pad_boxes(target["boxes"], padding) + if "masks" in target: + target["masks"] = self.pad_masks(target["masks"], padding) + if "keypoints" in target: + target["keypoints"] = self.pad_keypoints(target["keypoints"], padding) + if "perturbable_mask" in target: + target["perturbable_mask"] = self.pad_masks(target["perturbable_mask"], padding) + if "gs_coords" in target: + target["gs_coords"] = self.pad_coordinates(target["gs_coords"], padding) + + return image, target + + def pad_boxes(self, boxes, padding): + boxes[:, 0] += padding[0] # X + left + boxes[:, 1] += padding[1] # Y + top + boxes[:, 2] += padding[0] # X + left + boxes[:, 3] += padding[1] # Y + top + + return boxes + + def pad_masks(self, masks, padding): + return F.pad(masks, padding, fill=0) + + def pad_keypoints(self, keypoints, padding): + raise NotImplementedError + + def pad_coordinates(self, coordinates, padding): + # coordinates are [[left, top], [right, top], [right, bottom], [left, bottom]] + # padding is [left, 
top, right bottom] + coordinates[:, 0] += padding[0] # left padding + coordinates[:, 1] += padding[1] # top padding + + return coordinates + + +class Resize(ExTransform): + def __init__(self, size): + self.size = size + + def __call__( + self, + image: Tensor, + target: dict[str, Tensor] | None = None, + ): + orig_w, orig_h = F.get_image_size(image) + image = F.resize(image, size=self.size) + new_w, new_h = F.get_image_size(image) + + dw, dh = new_w / orig_w, new_h / orig_h + + if target is not None: + if "boxes" in target: + target["boxes"] = self.resize_boxes(target["boxes"], (dw, dh)) + if "masks" in target: + target["masks"] = self.resize_masks(target["masks"], (dw, dh)) + if "keypoints" in target: + target["keypoints"] = self.resize_keypoints(target["keypoints"], (dw, dh)) + if "perturbable_mask" in target: + target["perturbable_mask"] = self.resize_masks( + target["perturbable_mask"], (dw, dh) + ) + if "gs_coords" in target: + target["gs_coords"] = self.resize_coordinates(target["gs_coords"], (dw, dh)) + + return image, target + + def resize_boxes(self, boxes, ratio): + boxes[:, 0] *= ratio[0] # X1 * width ratio + boxes[:, 1] *= ratio[1] # Y1 * height ratio + boxes[:, 2] *= ratio[0] # X2 * width ratio + boxes[:, 3] *= ratio[1] # Y2 * height ratio + + return boxes + + def resize_masks(self, masks, ratio): + assert len(masks.shape) == 3 + + # Resize fails on empty tensors + if masks.shape[0] == 0: + return torch.zeros((0, *self.size), dtype=masks.dtype, device=masks.device) + + return F.resize(masks, size=self.size, interpolation=F.InterpolationMode.NEAREST) + + def resize_keypoints(self, keypoints, ratio): + raise NotImplementedError + + def resize_coordinates(self, coordinates, ratio): + # coordinates are [[left, top], [right, top], [right, bottom], [left, bottom]] + # ratio is [width, height] + coordinates[:, 0] = (coordinates[:, 0] * ratio[0]).to(int) # width ratio + coordinates[:, 1] = (coordinates[:, 1] * ratio[1]).to(int) # height ratio + + return coordinates + + +class RemapLabels(ExTransform): + COCO_MAP = { + 1: 0, + 2: 1, + 3: 2, + 4: 3, + 5: 4, + 6: 5, + 7: 6, + 8: 7, + 9: 8, + 10: 9, + 11: 10, + 13: 11, + 14: 12, + 15: 13, + 16: 14, + 17: 15, + 18: 16, + 19: 17, + 20: 18, + 21: 19, + 22: 20, + 23: 21, + 24: 22, + 25: 23, + 27: 24, + 28: 25, + 31: 26, + 32: 27, + 33: 28, + 34: 29, + 35: 30, + 36: 31, + 37: 32, + 38: 33, + 39: 34, + 40: 35, + 41: 36, + 42: 37, + 43: 38, + 44: 39, + 46: 40, + 47: 41, + 48: 42, + 49: 43, + 50: 44, + 51: 45, + 52: 46, + 53: 47, + 54: 48, + 55: 49, + 56: 50, + 57: 51, + 58: 52, + 59: 53, + 60: 54, + 61: 55, + 62: 56, + 63: 57, + 64: 58, + 65: 59, + 67: 60, + 70: 61, + 72: 62, + 73: 63, + 74: 64, + 75: 65, + 76: 66, + 77: 67, + 78: 68, + 79: 69, + 80: 70, + 81: 71, + 82: 72, + 84: 73, + 85: 74, + 86: 75, + 87: 76, + 88: 77, + 89: 78, + 90: 79, + } + + def __init__( + self, + label_map: dict[int, int] | None = None, + ): + if label_map is None: + label_map = self.COCO_MAP + + self.label_map = label_map + + def __call__( + self, + image: Tensor, + target: dict[str, Tensor], + ): + labels = target["labels"] + + # This is a terrible implementation + for i, label in enumerate(labels): + labels[i] = self.label_map[label.item()] + + target["labels"] = labels + + return image, target + + +class CreatePerturbableMaskFromImage(ExTransform): + def __init__(self, chroma_key, threshold): + self.chroma_key = torch.tensor(chroma_key) + self.threshold = threshold + + def __call__( + self, + image: Tensor, + target: dict[str, Tensor], + ): + self.chroma_key = 
self.chroma_key.to(image.device) + + l2_dist = ((image - self.chroma_key[:, None, None]) ** 2).sum(dim=0, keepdim=True).sqrt() + perturbable_mask = l2_dist <= self.threshold + + target["perturbable_mask"] = perturbable_mask.float() + + return image, target From 2f63ea9b59c7334953a6bd41aa4fc4549c8078d9 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 14:00:27 -0700 Subject: [PATCH 19/23] Add YOLO v3/v4 ShapeShifter experiments --- .../experiment/COCO_YOLOv3_ShapeShifter.yaml | 158 ++++++++++++++++++ .../experiment/COCO_YOLOv4_ShapeShifter.yaml | 158 ++++++++++++++++++ mart/configs/model/yolo.yaml | 7 +- mart/datamodules/coco.py | 15 +- mart/nn/nn.py | 28 +++- 5 files changed, 362 insertions(+), 4 deletions(-) create mode 100644 mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml create mode 100644 mart/configs/experiment/COCO_YOLOv4_ShapeShifter.yaml diff --git a/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml new file mode 100644 index 00000000..ad294e42 --- /dev/null +++ b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml @@ -0,0 +1,158 @@ +# @package _global_ + +defaults: + - /attack/perturber@model.modules.perturbation: default + - /attack/perturber/initializer@model.modules.perturbation.initializer: uniform + - /attack/perturber/projector@model.modules.perturbation.projector: range + - /attack/composer@model.modules.input_adv: warp_composite + - /attack/gradient_modifier@model.gradient_modifier: lp_normalizer + - override /datamodule: coco_yolo + - override /model: yolo + - override /optimization: super_convergence + - override /metric: average_precision + - override /callbacks: + [ + model_checkpoint, + lr_monitor, + perturbation_visualizer, + gradient_monitor, + attack_in_eval_mode, + no_grad_mode, + ] + +task_name: "COCO_YOLOv3_ShapeShifter" +tags: ["adv"] + +optimized_metric: "test_metrics/map" + +trainer: + # 64115 training images, batch_size=8, FLOOR(64115/16) = 8014 + max_steps: 80140 # 10 epochs + # mAP can be slow to compute so limit number of images + limit_val_batches: 100 + precision: 32 + +callbacks: + model_checkpoint: + monitor: "validation_metrics/map" + mode: "min" + + attack_in_eval_mode: + module_classes: + - _target_: hydra.utils.get_class + path: torch.nn.BatchNorm2d + + no_grad_mode: + module_names: "model.yolo" + + perturbation_visualizer: + perturbation: "model.perturbation.perturbation" + frequency: 500 + +datamodule: + num_workers: 4 + ims_per_batch: 8 + + train_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_train2017.json + val_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_val2017.json + test_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_val2017.json + +model: + modules: + empty_targets: + _target_: mart.nn.EmptyTargets + + yolo: + config_path: ${paths.data_dir}/yolov3.cfg + weights_path: ${paths.data_dir}/yolov3.weights + + perturbation: + size: [3, 416, 234] + + initializer: + min: 0.49 + max: 0.51 + + projector: + min: 0.0 + max: 1.0 + + total_variation: + _target_: mart.nn.TotalVariation + + input_adv: + warp: + _target_: torchvision.transforms.Compose + transforms: + - _target_: mart.transforms.ColorJitter + brightness: [0.5, 1.5] + contrast: [0.5, 1.5] + saturation: [0.5, 1.0] + hue: [-0.05, 0.05] + - _target_: torchvision.transforms.RandomAffine + degrees: [-5, 5] + translate: [0.1, 0.25] + scale: [0.4, 0.6] + shear: [-3, 3, -3, 3] + interpolation: 2 # BILINEAR + clamp: [0, 1] + + 
optimizer: + lr: 0.05 + momentum: 0.9 + + lr_scheduler: + scheduler: + three_phase: true + + gradient_modifier: null + + training_sequence: + seq004: + empty_targets: + targets: "target.list_of_targets" + seq005: "perturbation" + seq006: "input_adv" + seq010: + yolo: + images: "input_adv" + seq030: + yolo: + images: "input_adv" + targets: "empty_targets" + seq050: + total_variation: + _call_with_args_: + - "perturbation" + seq100: + loss: + _call_with_args_: + - "losses.confidence" + - "total_variation" + weights: + - 1 + - 0.0001 + + training_metrics: null + training_step_log: + total_variation: "total_variation" + + validation_sequence: + seq004: ${..training_sequence.seq004} + seq005: ${..training_sequence.seq005} + seq006: ${..training_sequence.seq006} + seq010: ${..training_sequence.seq010} + seq030: + yolo: + images: "input_adv" + targets: "empty_targets" + seq050: ${..training_sequence.seq050} + seq100: ${..training_sequence.seq100} + + test_sequence: + seq005: ${..training_sequence.seq005} + seq006: ${..training_sequence.seq006} + seq010: ${..training_sequence.seq010} diff --git a/mart/configs/experiment/COCO_YOLOv4_ShapeShifter.yaml b/mart/configs/experiment/COCO_YOLOv4_ShapeShifter.yaml new file mode 100644 index 00000000..a2dc74e8 --- /dev/null +++ b/mart/configs/experiment/COCO_YOLOv4_ShapeShifter.yaml @@ -0,0 +1,158 @@ +# @package _global_ + +defaults: + - /attack/perturber@model.modules.perturbation: default + - /attack/perturber/initializer@model.modules.perturbation.initializer: uniform + - /attack/perturber/projector@model.modules.perturbation.projector: range + - /attack/composer@model.modules.input_adv: warp_composite + - /attack/gradient_modifier@model.gradient_modifier: lp_normalizer + - override /datamodule: coco_yolo + - override /model: yolo + - override /optimization: super_convergence + - override /metric: average_precision + - override /callbacks: + [ + model_checkpoint, + lr_monitor, + perturbation_visualizer, + gradient_monitor, + attack_in_eval_mode, + no_grad_mode, + ] + +task_name: "COCO_YOLOv4_ShapeShifter" +tags: ["adv"] + +optimized_metric: "test_metrics/map" + +trainer: + # 64115 training images, batch_size=8, FLOOR(64115/16) = 8014 + max_steps: 80140 # 10 epochs + # mAP can be slow to compute so limit number of images + limit_val_batches: 100 + precision: 32 + +callbacks: + model_checkpoint: + monitor: "validation_metrics/map" + mode: "min" + + attack_in_eval_mode: + module_classes: + - _target_: hydra.utils.get_class + path: torch.nn.BatchNorm2d + + no_grad_mode: + module_names: "model.yolo" + + perturbation_visualizer: + perturbation: "model.perturbation.perturbation" + frequency: 500 + +datamodule: + num_workers: 4 + ims_per_batch: 8 + + train_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_train2017.json + val_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_val2017.json + test_dataset: + annFile: ${paths.data_dir}/coco/annotations/person_instances_val2017.json + +model: + modules: + empty_targets: + _target_: mart.nn.EmptyTargets + + yolo: + config_path: ${paths.data_dir}/yolov4.cfg + weights_path: ${paths.data_dir}/yolov4.weights + + perturbation: + size: [3, 416, 234] + + initializer: + min: 0.49 + max: 0.51 + + projector: + min: 0.0 + max: 1.0 + + total_variation: + _target_: mart.nn.TotalVariation + + input_adv: + warp: + _target_: torchvision.transforms.Compose + transforms: + - _target_: mart.transforms.ColorJitter + brightness: [0.5, 1.5] + contrast: [0.5, 1.5] + saturation: [0.5, 1.0] + hue: 
[-0.05, 0.05] + - _target_: torchvision.transforms.RandomAffine + degrees: [-5, 5] + translate: [0.1, 0.25] + scale: [0.4, 0.6] + shear: [-3, 3, -3, 3] + interpolation: 2 # BILINEAR + clamp: [0, 1] + + optimizer: + lr: 0.05 + momentum: 0.9 + + lr_scheduler: + scheduler: + three_phase: true + + gradient_modifier: null + + training_sequence: + seq004: + empty_targets: + targets: "target.list_of_targets" + seq005: "perturbation" + seq006: "input_adv" + seq010: + yolo: + images: "input_adv" + seq030: + yolo: + images: "input_adv" + targets: "empty_targets" + seq050: + total_variation: + _call_with_args_: + - "perturbation" + seq100: + loss: + _call_with_args_: + - "losses.confidence" + - "total_variation" + weights: + - 1 + - 0.0001 + + training_metrics: null + training_step_log: + total_variation: "total_variation" + + validation_sequence: + seq004: ${..training_sequence.seq004} + seq005: ${..training_sequence.seq005} + seq006: ${..training_sequence.seq006} + seq010: ${..training_sequence.seq010} + seq030: + yolo: + images: "input_adv" + targets: "empty_targets" + seq050: ${..training_sequence.seq050} + seq100: ${..training_sequence.seq100} + + test_sequence: + seq005: ${..training_sequence.seq005} + seq006: ${..training_sequence.seq006} + seq010: ${..training_sequence.seq010} diff --git a/mart/configs/model/yolo.yaml b/mart/configs/model/yolo.yaml index f21cc6f8..83818ed7 100644 --- a/mart/configs/model/yolo.yaml +++ b/mart/configs/model/yolo.yaml @@ -1,6 +1,9 @@ defaults: - modular +# FIXME: it would be nice to not do this... +output_target_key: "target.list_of_targets" + modules: yolo: _target_: mart.models.detection.yolo.yolo_darknet @@ -32,7 +35,7 @@ training_sequence: yolo: _name_: "losses" images: "input" - targets: "target" + targets: "target.list_of_targets" seq100: loss: @@ -48,7 +51,7 @@ validation_sequence: yolo: _name_: "losses" images: "input" - targets: "target" + targets: "target.list_of_targets" _train_mode_: False seq100: ${..training_sequence.seq100} diff --git a/mart/datamodules/coco.py b/mart/datamodules/coco.py index 43b2e45b..746e33ed 100644 --- a/mart/datamodules/coco.py +++ b/mart/datamodules/coco.py @@ -96,9 +96,22 @@ def __getitem__(self, index: int): def collate_fn(batch): return tuple(zip(*batch)) + def yolo_collate_fn(batch): images, targets = collate_fn(batch) + # Collate images images = default_collate(images) - return images, targets + # Turn tuple of dicts into dict of tuples + new_targets = {k: tuple(t[k] for t in targets) for k in targets[0].keys()} + new_targets["list_of_targets"] = targets + + # Collate targets + COLLATABLE_KEYS = ["perturbable_mask"] + + for key in new_targets.keys(): + if key in COLLATABLE_KEYS: + new_targets[key] = default_collate(new_targets[key]) + + return images, new_targets diff --git a/mart/nn/nn.py b/mart/nn/nn.py index b3661952..9b3310ec 100644 --- a/mart/nn/nn.py +++ b/mart/nn/nn.py @@ -13,7 +13,7 @@ import torch -__all__ = ["GroupNorm32", "SequentialDict", "ReturnKwargs", "CallWith", "Sum"] +__all__ = ["GroupNorm32", "SequentialDict", "ReturnKwargs", "CallWith", "Sum", "TotalVariation", "EmptyTargets"] logger = logging.getLogger(__name__) @@ -300,3 +300,29 @@ def forward(self, *values, weights=None): assert len(weights) == len(values) return sum(value * weight for value, weight in zip(values, weights)) + + +# FIXME: This must exist already?! 
+class TotalVariation(Sum): + def forward(self, *values, weights=None): + values = [self._total_variation(value) for value in values] + + return super().forward(*values, weights=weights) + + def _total_variation(self, image): + return torch.mean( + torch.sum(torch.square(image[:, 1:, :] - image[:, :-1, :])) + + torch.sum( # noqa: W503 + torch.square(image[:, :, 1:] - image[:, :, :-1]) + ) + ) + + +class EmptyTargets(torch.nn.Module): + def forward(self, targets): + return [ + { + "boxes": torch.empty((0, 4), device=t["boxes"].device), + "labels": torch.empty(0, dtype=torch.int64, device=t["labels"].device), + } for t in targets + ] From 45a06da4c4199945fa0cd4543a769e438e71f167 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Fri, 23 Jun 2023 14:11:46 -0700 Subject: [PATCH 20/23] style --- mart/datamodules/coco.py | 2 +- mart/models/detection/__init__.py | 2 +- mart/models/detection/anchor_utils.py | 3 +- mart/models/detection/box_utils.py | 9 +- mart/models/detection/target_matching.py | 42 ++- mart/models/detection/yolo.py | 75 +++-- mart/models/detection/yolo_loss.py | 38 ++- mart/models/detection/yolo_networks.py | 343 ++++++++++++++++++----- mart/models/yolo.py | 209 +++++++++++--- mart/nn/nn.py | 29 +- 10 files changed, 577 insertions(+), 175 deletions(-) diff --git a/mart/datamodules/coco.py b/mart/datamodules/coco.py index 746e33ed..167bb49b 100644 --- a/mart/datamodules/coco.py +++ b/mart/datamodules/coco.py @@ -7,9 +7,9 @@ import os from typing import Any, Callable, List, Optional +import numpy as np import torch from torch.utils.data import default_collate -import numpy as np from torchvision.datasets.coco import CocoDetection as CocoDetection_ from torchvision.datasets.folder import default_loader diff --git a/mart/models/detection/__init__.py b/mart/models/detection/__init__.py index 7395b681..c4164cd7 100644 --- a/mart/models/detection/__init__.py +++ b/mart/models/detection/__init__.py @@ -1,5 +1,5 @@ # https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/__init__.py -from .yolo import YOLO, yolo_darknet, yolov4, YOLOV4_Backbone_Weights, YOLOV4_Weights +from .yolo import YOLO, YOLOV4_Backbone_Weights, YOLOV4_Weights, yolo_darknet, yolov4 from .yolo_networks import ( DarknetNetwork, YOLOV4Network, diff --git a/mart/models/detection/anchor_utils.py b/mart/models/detection/anchor_utils.py index 8e89becd..d5e0b6a6 100644 --- a/mart/models/detection/anchor_utils.py +++ b/mart/models/detection/anchor_utils.py @@ -29,7 +29,8 @@ def grid_centers(grid_size: Tensor) -> Tensor: @torch.jit.script def global_xy(xy: Tensor, image_size: Tensor) -> Tensor: - """Adds offsets to the predicted box center coordinates to obtain global coordinates to the image. + """Adds offsets to the predicted box center coordinates to obtain global coordinates to the + image. The predicted coordinates are interpreted as coordinates inside a grid cell whose width and height is 1. 
Adding offset to the cell, dividing by the grid size, and multiplying by the image size, we get global coordinates in the diff --git a/mart/models/detection/box_utils.py b/mart/models/detection/box_utils.py index d011374c..01f75514 100644 --- a/mart/models/detection/box_utils.py +++ b/mart/models/detection/box_utils.py @@ -1,13 +1,12 @@ # https://raw.githubusercontent.com/pytorch/vision/ae30df455405fb56946425bf3f3c318280b0a7ae/torchvision/models/detection/box_utils.py import torch from torch import Tensor - from torchvision.ops import box_iou def aligned_iou(wh1: Tensor, wh2: Tensor) -> Tensor: - """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes are located at - the same coordinates. + """Calculates a matrix of intersections over union from box dimensions, assuming that the boxes + are located at the same coordinates. Args: wh1: An ``[N, 2]`` matrix of box shapes (width and height). @@ -27,8 +26,8 @@ def aligned_iou(wh1: Tensor, wh2: Tensor) -> Tensor: def iou_below(pred_boxes: Tensor, target_boxes: Tensor, threshold: float) -> Tensor: - """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any target - significantly (IoU greater than ``threshold``). + """Creates a binary mask whose value will be ``True``, unless the predicted box overlaps any + target significantly (IoU greater than ``threshold``). Args: pred_boxes: The predicted corner coordinates. Tensor of size ``[height, width, boxes_per_cell, 4]``. diff --git a/mart/models/detection/target_matching.py b/mart/models/detection/target_matching.py index 7f1d1c88..f755ea5a 100644 --- a/mart/models/detection/target_matching.py +++ b/mart/models/detection/target_matching.py @@ -3,8 +3,8 @@ import torch from torch import Tensor - from torchvision.ops import box_convert + from .anchor_utils import grid_centers from .box_utils import aligned_iou, box_size_ratio, iou_below, is_inside_box from .yolo_loss import YOLOLoss @@ -12,7 +12,9 @@ PRIOR_SHAPES = List[List[int]] # TorchScript doesn't allow a list of tuples. -def target_boxes_to_grid(preds: Tensor, targets: Tensor, image_size: Tensor) -> Tuple[Tensor, Tensor]: +def target_boxes_to_grid( + preds: Tensor, targets: Tensor, image_size: Tensor +) -> Tuple[Tensor, Tensor]: """Scales target bounding boxes to feature map coordinates. It would be better to implement this in a super class, but TorchScript doesn't allow class inheritance. @@ -59,18 +61,23 @@ class HighestIoUMatching: """ def __init__( - self, prior_shapes: PRIOR_SHAPES, prior_shape_idxs: List[int], ignore_bg_threshold: float = 0.7 + self, + prior_shapes: PRIOR_SHAPES, + prior_shape_idxs: List[int], + ignore_bg_threshold: float = 0.7, ) -> None: self.prior_shapes = prior_shapes # anchor_map maps the anchor indices to anchors in this layer, or to -1 if it's not an anchor of this layer. # This layer ignores the target if all the selected anchors are in another layer. self.anchor_map = [ - prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 for idx in range(len(prior_shapes)) + prior_shape_idxs.index(idx) if idx in prior_shape_idxs else -1 + for idx in range(len(prior_shapes)) ] self.ignore_bg_threshold = ignore_bg_threshold def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: - """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + """Selects anchors for each target based on the predicted shapes. The subclasses implement + this method. Args: wh: A matrix of predicted width and height values. 
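A minimal sketch (illustrative values only, not part of any patch above) of the highest-IoU prior matching that these target_matching.py hunks describe, assuming the aligned_iou helper defined in mart/models/detection/box_utils.py earlier in this series:

    import torch
    from mart.models.detection.box_utils import aligned_iou

    prior_wh = torch.tensor([[10.0, 13.0], [33.0, 23.0], [62.0, 45.0]])  # anchor shapes (w, h) in input pixels
    target_wh = torch.tensor([[30.0, 25.0]])                             # one target box shape (w, h)

    # aligned_iou() places both boxes at the same coordinates, so the IoU matrix
    # depends only on the width/height pairs.
    ious = aligned_iou(target_wh, prior_wh)   # shape [1, 3]
    best_prior = ious.argmax(dim=1)           # tensor([1]): the 33x23 prior fits this target best
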
@@ -95,7 +102,8 @@ def __call__( targets: Dict[str, Tensor], image_size: Tensor, ) -> Tuple[List[Tensor], Tensor, Tensor]: - """For each target, selects predictions from the same grid cell, where the center of the target box is. + """For each target, selects predictions from the same grid cell, where the center of the + target box is. Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the predictions within the grid cell. @@ -131,7 +139,7 @@ class IoUThresholdMatching: network input resolution. prior_shape_idxs: List of indices to ``prior_shapes`` that is used to select the (usually 3) prior shapes that this layer uses. - threshold: IoU treshold for matching. + threshold: IoU threshold for matching. ignore_bg_threshold: If a predictor is not responsible for predicting any target, but the corresponding anchor has IoU with some target greater than this threshold, the predictor will not be taken into account when calculating the confidence loss. @@ -149,7 +157,8 @@ def __init__( self.ignore_bg_threshold = ignore_bg_threshold def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: - """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + """Selects anchors for each target based on the predicted shapes. The subclasses implement + this method. Args: wh: A matrix of predicted width and height values. @@ -170,7 +179,8 @@ def __call__( targets: Dict[str, Tensor], image_size: Tensor, ) -> Tuple[List[Tensor], Tensor, Tensor]: - """For each target, selects predictions from the same grid cell, where the center of the target box is. + """For each target, selects predictions from the same grid cell, where the center of the + target box is. Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the predictions within the grid cell. @@ -199,8 +209,8 @@ def __call__( class SizeRatioMatching: - """For each target, select those prior shapes, whose width and height relative to the target is below given - ratio. + """For each target, select those prior shapes, whose width and height relative to the target is + below given ratio. This is the matching rule used by Ultralytics YOLOv5 implementation. @@ -227,7 +237,8 @@ def __init__( self.ignore_bg_threshold = ignore_bg_threshold def match(self, wh: Tensor) -> Tuple[Tensor, Tensor]: - """Selects anchors for each target based on the predicted shapes. The subclasses implement this method. + """Selects anchors for each target based on the predicted shapes. The subclasses implement + this method. Args: wh: A matrix of predicted width and height values. @@ -246,7 +257,8 @@ def __call__( targets: Dict[str, Tensor], image_size: Tensor, ) -> Tuple[List[Tensor], Tensor, Tensor]: - """For each target, selects predictions from the same grid cell, where the center of the target box is. + """For each target, selects predictions from the same grid cell, where the center of the + target box is. Typically there are three predictions per grid cell. Subclasses implement ``match()``, which selects the predictions within the grid cell. @@ -365,7 +377,9 @@ def __call__( matched targets. The last tensor contains as many elements as there are ``True`` values in the first mask. 
""" height, width, boxes_per_cell, _ = preds["boxes"].shape - prior_mask, anchor_inside_target = self._get_prior_mask(targets, image_size, width, height, boxes_per_cell) + prior_mask, anchor_inside_target = self._get_prior_mask( + targets, image_size, width, height, boxes_per_cell + ) prior_preds = { "boxes": preds["boxes"][prior_mask], "confidences": preds["confidences"][prior_mask], diff --git a/mart/models/detection/yolo.py b/mart/models/detection/yolo.py index b338f116..ba580616 100644 --- a/mart/models/detection/yolo.py +++ b/mart/models/detection/yolo.py @@ -5,21 +5,22 @@ import torch import torch.nn as nn from torch import Tensor - +from torchvision.models._api import Weights, WeightsEnum, register_model +from torchvision.models._utils import _ovewrite_value_param +from torchvision.models.detection.backbone_utils import _validate_trainable_layers from torchvision.ops import batched_nms from torchvision.transforms import functional as F -from torchvision.models._api import register_model, Weights, WeightsEnum -from torchvision.models._utils import _ovewrite_value_param + from mart.models.yolo import YOLOV4Backbone -from torchvision.models.detection.backbone_utils import _validate_trainable_layers -from .yolo_networks import DarknetNetwork, PRED, TARGET, TARGETS, YOLOV4Network + +from .yolo_networks import PRED, TARGET, TARGETS, DarknetNetwork, YOLOV4Network IMAGES = List[Tensor] # TorchScript doesn't allow a tuple. class YOLO(nn.Module): - """YOLO implementation that supports the most important features of YOLOv3, YOLOv4, YOLOv5, YOLOv7, Scaled- - YOLOv4, and YOLOX. + """YOLO implementation that supports the most important features of YOLOv3, YOLOv4, YOLOv5, + YOLOv7, Scaled- YOLOv4, and YOLOX. *YOLOv3 paper*: `Joseph Redmon and Ali Farhadi `__ @@ -113,8 +114,8 @@ def __init__( def forward( self, images: Union[Tensor, IMAGES], targets: Optional[TARGETS] = None ) -> Union[Tensor, Dict[str, Tensor]]: - """Runs a forward pass through the network (all layers listed in ``self.network``), and if training targets - are provided, computes the losses from the detection layers. + """Runs a forward pass through the network (all layers listed in ``self.network``), and if + training targets are provided, computes the losses from the detection layers. Detections are concatenated from the detection layers. Each detection layer will produce a number of detections that depends on the size of the feature map and the number of anchors per feature map cell. @@ -144,8 +145,8 @@ def forward( return {"overlap": losses[0], "confidence": losses[1], "classification": losses[2]} def infer(self, image: Tensor) -> PRED: - """Feeds an image to the network and returns the detected bounding boxes, confidence scores, and class - labels. + """Feeds an image to the network and returns the detected bounding boxes, confidence + scores, and class labels. If a prediction has a high score for more than one class, it will be duplicated. @@ -172,9 +173,9 @@ def infer(self, image: Tensor) -> PRED: return detections def process_detections(self, preds: Tensor) -> List[PRED]: - """Splits the detection tensor returned by a forward pass into a list of prediction dictionaries, and - filters them based on confidence threshold, non-maximum suppression (NMS), and maximum number of - predictions. + """Splits the detection tensor returned by a forward pass into a list of prediction + dictionaries, and filters them based on confidence threshold, non-maximum suppression + (NMS), and maximum number of predictions. 
If for any single detection there are multiple categories whose score is above the confidence threshold, the detection will be duplicated to create one detection for each category. NMS processes one category at a time, @@ -237,15 +238,21 @@ def validate_batch(self, images: Union[Tensor, IMAGES], targets: Optional[TARGET """ if not isinstance(images, Tensor): if not isinstance(images, (tuple, list)): - raise TypeError(f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}.") + raise TypeError( + f"Expected images to be a Tensor, tuple, or a list, got {type(images).__name__}." + ) if not images: raise ValueError("No images in batch.") shape = images[0].shape for image in images: if not isinstance(image, Tensor): - raise ValueError(f"Expected image to be of type Tensor, got {type(image).__name__}.") + raise ValueError( + f"Expected image to be of type Tensor, got {type(image).__name__}." + ) if image.shape != shape: - raise ValueError(f"Images with different shapes in one batch: {shape} and {image.shape}") + raise ValueError( + f"Images with different shapes in one batch: {shape} and {image.shape}" + ) if targets is None: if self.training: @@ -254,7 +261,9 @@ def validate_batch(self, images: Union[Tensor, IMAGES], targets: Optional[TARGET return if not isinstance(targets, (tuple, list)): - raise TypeError(f"Expected targets to be a tuple or a list, got {type(images).__name__}.") + raise TypeError( + f"Expected targets to be a tuple or a list, got {type(images).__name__}." + ) if len(images) != len(targets): raise ValueError(f"Got {len(images)} images, but targets for {len(targets)} images.") @@ -263,14 +272,20 @@ def validate_batch(self, images: Union[Tensor, IMAGES], targets: Optional[TARGET raise ValueError("Target dictionary doesn't contain boxes.") boxes = target["boxes"] if not isinstance(boxes, Tensor): - raise TypeError(f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}.") + raise TypeError( + f"Expected target boxes to be of type Tensor, got {type(boxes).__name__}." + ) if (boxes.ndim != 2) or (boxes.shape[-1] != 4): - raise ValueError(f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}.") + raise ValueError( + f"Expected target boxes to be tensors of shape [N, 4], got {list(boxes.shape)}." + ) if "labels" not in target: raise ValueError("Target dictionary doesn't contain labels.") labels = target["labels"] if not isinstance(labels, Tensor): - raise ValueError(f"Expected target labels to be of type Tensor, got {type(labels).__name__}.") + raise ValueError( + f"Expected target labels to be of type Tensor, got {type(labels).__name__}." + ) if (labels.ndim < 1) or (labels.ndim > 2) or (len(labels) != len(boxes)): raise ValueError( f"Expected target labels to be tensors of shape [N] or [N, num_classes], got {list(labels.shape)}." @@ -295,7 +310,9 @@ class YOLOV4_Weights(WeightsEnum): ) -def freeze_backbone_layers(backbone: nn.Module, trainable_layers: Optional[int], is_trained: bool) -> None: +def freeze_backbone_layers( + backbone: nn.Module, trainable_layers: Optional[int], is_trained: bool +) -> None: """Freezes backbone layers layers that won't be used for training. Args: @@ -331,8 +348,7 @@ def yolov4( detections_per_image: int = 300, **kwargs: Any, ) -> YOLO: - """ - Constructs a YOLOv4 model. + """Constructs a YOLOv4 model. .. 
betastatus:: detection module @@ -376,11 +392,15 @@ def yolov4( if weights is not None: weights_backbone = None - num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"])) + num_classes = _ovewrite_value_param( + "num_classes", num_classes, len(weights.meta["categories"]) + ) elif num_classes is None: num_classes = 91 - backbone_kwargs = {key: kwargs[key] for key in ("widths", "activation", "normalization") if key in kwargs} + backbone_kwargs = { + key: kwargs[key] for key in ("widths", "activation", "normalization") if key in kwargs + } backbone = YOLOV4Backbone(in_channels, **backbone_kwargs) is_trained = weights is not None or weights_backbone is not None @@ -406,8 +426,7 @@ def yolo_darknet( detections_per_image: int = 300, **kwargs: Any, ) -> YOLO: - """ - Constructs a YOLO model from a Darknet configuration file. + """Constructs a YOLO model from a Darknet configuration file. .. betastatus:: detection module diff --git a/mart/models/detection/yolo_loss.py b/mart/models/detection/yolo_loss.py index e6bd69da..f0e98609 100644 --- a/mart/models/detection/yolo_loss.py +++ b/mart/models/detection/yolo_loss.py @@ -4,8 +4,11 @@ import torch from torch import Tensor -from torch.nn.functional import binary_cross_entropy, binary_cross_entropy_with_logits, one_hot - +from torch.nn.functional import ( + binary_cross_entropy, + binary_cross_entropy_with_logits, + one_hot, +) from torchvision.ops import ( box_iou, complete_box_iou, @@ -87,19 +90,24 @@ def _pairwise_confidence_loss( targets = torch.ones_like(preds) - predict_overlap # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. targets += predict_overlap * overlap.detach().clamp(min=0) - return _binary_cross_entropy(preds, targets, reduction="none", input_is_normalized=input_is_normalized) + return _binary_cross_entropy( + preds, targets, reduction="none", input_is_normalized=input_is_normalized + ) else: # When not predicting overlap, target confidence is the same for every prediction, but we should still return a # matrix. targets = torch.ones_like(preds) - result = _binary_cross_entropy(preds, targets, reduction="none", input_is_normalized=input_is_normalized) + result = _binary_cross_entropy( + preds, targets, reduction="none", input_is_normalized=input_is_normalized + ) return result.unsqueeze(1).expand(overlap.shape) def _foreground_confidence_loss( preds: Tensor, overlap: Tensor, input_is_normalized: bool, predict_overlap: Optional[float] ) -> Tensor: - """Calculates the sum of the confidence losses for foreground anchors and their matched targets. + """Calculates the sum of the confidence losses for foreground anchors and their matched + targets. If ``predict_overlap`` is ``None``, the target confidence will be 1. If ``predict_overlap`` is 1.0, ``overlap`` will be used as the target confidence. Otherwise this parameter defines a balance between these two targets. The method @@ -120,7 +128,9 @@ def _foreground_confidence_loss( targets -= predict_overlap # Distance-IoU may return negative "overlaps", so we have to make sure that the targets are not negative. 
targets += predict_overlap * overlap.detach().clamp(min=0) - return _binary_cross_entropy(preds, targets, reduction="sum", input_is_normalized=input_is_normalized) + return _binary_cross_entropy( + preds, targets, reduction="sum", input_is_normalized=input_is_normalized + ) def _background_confidence_loss(preds: Tensor, input_is_normalized: bool) -> Tensor: @@ -134,13 +144,16 @@ def _background_confidence_loss(preds: Tensor, input_is_normalized: bool) -> Ten The sum of the background confidence losses. """ targets = torch.zeros_like(preds) - return _binary_cross_entropy(preds, targets, reduction="sum", input_is_normalized=input_is_normalized) + return _binary_cross_entropy( + preds, targets, reduction="sum", input_is_normalized=input_is_normalized + ) def _target_labels_to_probs( targets: Tensor, num_classes: int, dtype: torch.dtype, label_smoothing: Optional[float] = None ) -> Tensor: - """If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class probabilities. + """If ``targets`` is a vector of class labels, converts it to a matrix of one-hot class + probabilities. If label smoothing is disabled, the returned target probabilities will be binary. If label smoothing is enabled, the target probabilities will be, ``(label_smoothing / 2)`` or ``(label_smoothing / 2) + (1.0 - label_smoothing)``. That @@ -277,8 +290,9 @@ def elementwise_sums( input_is_normalized: bool, image_size: Tensor, ) -> Losses: - """Calculates the sums of the losses for optimization, over prediction/target pairs, assuming the - predictions and targets have been matched (there are as many predictions and targets). + """Calculates the sums of the losses for optimization, over prediction/target pairs, + assuming the predictions and targets have been matched (there are as many predictions and + targets). Args: preds: A dictionary of predictions, containing "boxes", "confidences", and "classprobs". @@ -296,7 +310,9 @@ def elementwise_sums( confidence_loss = _foreground_confidence_loss( preds["confidences"], overlap, input_is_normalized, self.predict_overlap ) - confidence_loss += _background_confidence_loss(preds["bg_confidences"], input_is_normalized) + confidence_loss += _background_confidence_loss( + preds["bg_confidences"], input_is_normalized + ) pred_probs = preds["classprobs"] target_probs = _target_labels_to_probs( diff --git a/mart/models/detection/yolo_networks.py b/mart/models/detection/yolo_networks.py index 26224ca8..389d3611 100644 --- a/mart/models/detection/yolo_networks.py +++ b/mart/models/detection/yolo_networks.py @@ -9,11 +9,11 @@ import torch import torch.nn as nn from torch import Tensor - from torchvision.ops import box_convert + from ..yolo import ( - Conv, CSPSPP, + Conv, CSPStage, ELANStage, FastSPP, @@ -26,7 +26,13 @@ YOLOV7Backbone, ) from .anchor_utils import global_xy -from .target_matching import HighestIoUMatching, IoUThresholdMatching, PRIOR_SHAPES, SimOTAMatching, SizeRatioMatching +from .target_matching import ( + PRIOR_SHAPES, + HighestIoUMatching, + IoUThresholdMatching, + SimOTAMatching, + SizeRatioMatching, +) from .yolo_loss import YOLOLoss DARKNET_CONFIG = Dict[str, Any] @@ -138,7 +144,10 @@ def forward(self, x: Tensor, image_size: Tensor) -> Tuple[Tensor, PREDS]: # It's better to use binary_cross_entropy_with_logits() for loss computation, so we'll provide the unnormalized # confidence and classprob, when available. 
- preds = [{"boxes": b, "confidences": c, "classprobs": p} for b, c, p in zip(box, confidence, classprob)] + preds = [ + {"boxes": b, "confidences": c, "classprobs": p} + for b, c, p in zip(box, confidence, classprob) + ] return output, preds @@ -172,7 +181,9 @@ def match_targets( pred_boxes = torch.empty((0, 4), device=return_preds[0]["boxes"].device) pred_confidences = torch.empty(0, device=return_preds[0]["confidences"].device) pred_bg_confidences = torch.empty(0, device=return_preds[0]["confidences"].device) - pred_classprobs = torch.empty((0, self.num_classes), device=return_preds[0]["classprobs"].device) + pred_classprobs = torch.empty( + (0, self.num_classes), device=return_preds[0]["classprobs"].device + ) target_boxes = torch.empty((0, 4), device=targets[0]["boxes"].device) target_labels = torch.empty(0, dtype=torch.int64, device=targets[0]["labels"].device) @@ -182,15 +193,23 @@ def match_targets( image_preds, image_targets, image_size ) pred_boxes = torch.cat((pred_boxes, image_return_preds["boxes"][pred_selector])) - pred_confidences = torch.cat((pred_confidences, image_return_preds["confidences"][pred_selector])) + pred_confidences = torch.cat( + (pred_confidences, image_return_preds["confidences"][pred_selector]) + ) pred_bg_confidences = torch.cat( (pred_bg_confidences, image_return_preds["confidences"][background_selector]) ) - pred_classprobs = torch.cat((pred_classprobs, image_return_preds["classprobs"][pred_selector])) + pred_classprobs = torch.cat( + (pred_classprobs, image_return_preds["classprobs"][pred_selector]) + ) target_boxes = torch.cat((target_boxes, image_targets["boxes"][target_selector])) - target_labels = torch.cat((target_labels, image_targets["labels"][target_selector])) + target_labels = torch.cat( + (target_labels, image_targets["labels"][target_selector]) + ) else: - pred_bg_confidences = torch.cat((pred_bg_confidences, image_return_preds["confidences"].flatten())) + pred_bg_confidences = torch.cat( + (pred_bg_confidences, image_return_preds["confidences"].flatten()) + ) matched_preds = { "boxes": pred_boxes, @@ -231,8 +250,12 @@ def calculate_losses( matched_preds, matched_targets = self.match_targets(preds, loss_preds, targets, image_size) - losses = self.loss_func.elementwise_sums(matched_preds, matched_targets, self.input_is_normalized, image_size) - losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / len(preds) + losses = self.loss_func.elementwise_sums( + matched_preds, matched_targets, self.input_is_normalized, image_size + ) + losses = torch.stack((losses.overlap, losses.confidence, losses.classification)) / len( + preds + ) hits = len(matched_targets["boxes"]) @@ -296,17 +319,28 @@ def create_detection_layer( matching_func: Callable if matching_algorithm == "simota": loss_func = YOLOLoss( - overlap_func, None, None, overlap_loss_multiplier, confidence_loss_multiplier, class_loss_multiplier + overlap_func, + None, + None, + overlap_loss_multiplier, + confidence_loss_multiplier, + class_loss_multiplier, + ) + matching_func = SimOTAMatching( + prior_shapes, prior_shape_idxs, loss_func, spatial_range, size_range ) - matching_func = SimOTAMatching(prior_shapes, prior_shape_idxs, loss_func, spatial_range, size_range) elif matching_algorithm == "size": if matching_threshold is None: raise ValueError("matching_threshold is required with size ratio matching.") - matching_func = SizeRatioMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + matching_func = SizeRatioMatching( + prior_shapes, 
prior_shape_idxs, matching_threshold, ignore_bg_threshold + ) elif matching_algorithm == "iou": if matching_threshold is None: raise ValueError("matching_threshold is required with IoU threshold matching.") - matching_func = IoUThresholdMatching(prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold) + matching_func = IoUThresholdMatching( + prior_shapes, prior_shape_idxs, matching_threshold, ignore_bg_threshold + ) elif matching_algorithm == "maxiou" or matching_algorithm is None: matching_func = HighestIoUMatching(prior_shapes, prior_shape_idxs, ignore_bg_threshold) else: @@ -321,14 +355,16 @@ def create_detection_layer( class_loss_multiplier, ) layer_shapes = [prior_shapes[i] for i in prior_shape_idxs] - return DetectionLayer(prior_shapes=layer_shapes, matching_func=matching_func, loss_func=loss_func, **kwargs) + return DetectionLayer( + prior_shapes=layer_shapes, matching_func=matching_func, loss_func=loss_func, **kwargs + ) class DetectionStage(nn.Module): """This is a convenience class for running a detection layer. - It might be cleaner to implement this as a function, but TorchScript allows only specific types in function - arguments, not modules. + It might be cleaner to implement this as a function, but TorchScript allows only specific types + in function arguments, not modules. """ def __init__(self, **kwargs: Any) -> None: @@ -344,7 +380,8 @@ def forward( losses: List[Tensor], hits: List[int], ) -> None: - """Runs the detection layer on the inputs and appends the output to the ``detections`` list. + """Runs the detection layer on the inputs and appends the output to the ``detections`` + list. If ``targets`` is given, also calculates the losses and appends to the ``losses`` list. @@ -361,7 +398,9 @@ def forward( detections.append(output) if targets is not None: - layer_losses, layer_hits = self.detection_layer.calculate_losses(preds, targets, image_size) + layer_losses, layer_hits = self.detection_layer.calculate_losses( + preds, targets, image_size + ) losses.append(layer_losses) hits.append(layer_hits) @@ -378,11 +417,17 @@ class DetectionStageWithAux(nn.Module): """ def __init__( - self, spatial_range: float = 5.0, aux_spatial_range: float = 3.0, aux_weight: float = 0.25, **kwargs: Any + self, + spatial_range: float = 5.0, + aux_spatial_range: float = 3.0, + aux_weight: float = 0.25, + **kwargs: Any, ) -> None: super().__init__() self.detection_layer = create_detection_layer(spatial_range=spatial_range, **kwargs) - self.aux_detection_layer = create_detection_layer(spatial_range=aux_spatial_range, **kwargs) + self.aux_detection_layer = create_detection_layer( + spatial_range=aux_spatial_range, **kwargs + ) self.aux_weight = aux_weight def forward( @@ -395,8 +440,8 @@ def forward( losses: List[Tensor], hits: List[int], ) -> None: - """Runs the detection layer and the auxiliary detection layer on their respective inputs and appends the - outputs to the ``detections`` list. + """Runs the detection layer and the auxiliary detection layer on their respective inputs + and appends the outputs to the ``detections`` list. If ``targets`` is given, also calculates the losses and appends to the ``losses`` list. @@ -415,7 +460,9 @@ def forward( if targets is not None: # Match lead head predictions to targets and calculate losses from lead head outputs. 
- layer_losses, layer_hits = self.detection_layer.calculate_losses(preds, targets, image_size) + layer_losses, layer_hits = self.detection_layer.calculate_losses( + preds, targets, image_size + ) losses.append(layer_losses) hits.append(layer_hits) @@ -522,7 +569,14 @@ def __init__( num_outputs = (5 + num_classes) * anchors_per_cell def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size, + stride=1, + activation=activation, + norm=normalization, + ) def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) @@ -542,7 +596,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: **kwargs, ) - self.backbone = backbone or YOLOV4TinyBackbone(width=width, activation=activation, normalization=normalization) + self.backbone = backbone or YOLOV4TinyBackbone( + width=width, activation=activation, normalization=normalization + ) self.fpn5 = conv(width * 16, width * 8) self.out5 = nn.Sequential( @@ -588,7 +644,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class YOLOV4Network(nn.Module): - """Network architecture that corresponds approximately to the Cross Stage Partial Network from YOLOv4. + """Network architecture that corresponds approximately to the Cross Stage Partial Network from + YOLOv4. Args: num_classes: Number of different classes that this model predicts. @@ -664,7 +721,14 @@ def spp(in_channels: int, out_channels: int) -> nn.Module: return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) def conv(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=normalization, + ) def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPStage( @@ -677,9 +741,18 @@ def csp(in_channels: int, out_channels: int) -> nn.Module: ) def out(in_channels: int) -> nn.Module: - conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + conv = Conv( + in_channels, + in_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, + ) outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) - return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + return nn.Sequential( + OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)]) + ) def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) @@ -687,7 +760,14 @@ def upsample(in_channels: int, out_channels: int) -> nn.Module: return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: assert prior_shapes is not None @@ -702,7 +782,9 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: if backbone is not None: self.backbone = backbone else: - self.backbone = 
YOLOV4Backbone(widths=widths, activation=activation, normalization=normalization) + self.backbone = YOLOV4Backbone( + widths=widths, activation=activation, normalization=normalization + ) w3 = widths[-3] w4 = widths[-2] @@ -758,7 +840,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class YOLOV4P6Network(nn.Module): - """Network architecture that corresponds approximately to the variant of YOLOv4 with four detection layers. + """Network architecture that corresponds approximately to the variant of YOLOv4 with four + detection layers. Args: num_classes: Number of different classes that this model predicts. @@ -841,7 +924,14 @@ def spp(in_channels: int, out_channels: int) -> nn.Module: return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) def conv(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=normalization, + ) def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPStage( @@ -854,9 +944,18 @@ def csp(in_channels: int, out_channels: int) -> nn.Module: ) def out(in_channels: int) -> nn.Module: - conv = Conv(in_channels, in_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + conv = Conv( + in_channels, + in_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, + ) outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) - return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + return nn.Sequential( + OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)]) + ) def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) @@ -864,7 +963,14 @@ def upsample(in_channels: int, out_channels: int) -> nn.Module: return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: assert prior_shapes is not None @@ -880,7 +986,10 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionStage: self.backbone = backbone else: self.backbone = YOLOV4Backbone( - widths=widths, depths=(1, 1, 3, 15, 15, 7, 7), activation=activation, normalization=normalization + widths=widths, + depths=(1, 1, 3, 15, 15, 7, 7), + activation=activation, + normalization=normalization, ) w3 = widths[-4] @@ -952,8 +1061,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class YOLOV5Network(nn.Module): - """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting the ``depth`` - and ``width`` parameters. + """The YOLOv5 network architecture. Different variants (n/s/m/l/x) can be achieved by adjusting + the ``depth`` and ``width`` parameters. Args: num_classes: Number of different classes that this model predicts. 
@@ -1034,10 +1143,24 @@ def spp(in_channels: int, out_channels: int) -> nn.Module: return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def conv(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=normalization, + ) def out(in_channels: int) -> nn.Module: outputs = nn.Conv2d(in_channels, num_outputs, kernel_size=1) @@ -1124,7 +1247,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class YOLOV7Network(nn.Module): - """Network architecture that corresponds to the W6 variant of YOLOv7 with four detection layers. + """Network architecture that corresponds to the W6 variant of YOLOv7 with four detection + layers. Args: num_classes: Number of different classes that this model predicts. @@ -1210,7 +1334,14 @@ def spp(in_channels: int, out_channels: int) -> nn.Module: return CSPSPP(in_channels, out_channels, activation=activation, norm=normalization) def conv(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=normalization, + ) def elan(in_channels: int, out_channels: int) -> nn.Module: return ELANStage( @@ -1225,10 +1356,17 @@ def elan(in_channels: int, out_channels: int) -> nn.Module: def out(in_channels: int, hidden_channels: int) -> nn.Module: conv = Conv( - in_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=normalization + in_channels, + hidden_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, ) outputs = nn.Conv2d(hidden_channels, num_outputs, kernel_size=1) - return nn.Sequential(OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)])) + return nn.Sequential( + OrderedDict([("conv", conv), (f"outputs_{num_outputs}", outputs)]) + ) def upsample(in_channels: int, out_channels: int) -> nn.Module: channels = conv(in_channels, out_channels) @@ -1236,7 +1374,14 @@ def upsample(in_channels: int, out_channels: int) -> nn.Module: return nn.Sequential(OrderedDict([("channels", channels), ("upsample", upsample)])) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def detect(prior_shape_idxs: Sequence[int]) -> DetectionStageWithAux: assert prior_shapes is not None @@ -1252,7 +1397,11 @@ def detect(prior_shape_idxs: Sequence[int]) -> DetectionStageWithAux: self.backbone = backbone else: self.backbone = YOLOV7Backbone( - widths=widths, depth=2, block_depth=2, activation=activation, normalization=normalization + widths=widths, + depth=2, + block_depth=2, + activation=activation, + normalization=normalization, ) w3 = widths[-4] @@ -1320,16 +1469,24 @@ def forward(self, x: Tensor, targets: 
Optional[TARGETS] = None) -> NETWORK_OUTPU x = torch.cat((self.downsample5(n5), c6), dim=1) n6 = self.pan6(x) - self.detect3(self.out3(n3), self.aux_out3(n3), targets, image_size, detections, losses, hits) - self.detect4(self.out4(n4), self.aux_out4(p4), targets, image_size, detections, losses, hits) - self.detect5(self.out5(n5), self.aux_out5(p5), targets, image_size, detections, losses, hits) - self.detect6(self.out6(n6), self.aux_out6(c6), targets, image_size, detections, losses, hits) + self.detect3( + self.out3(n3), self.aux_out3(n3), targets, image_size, detections, losses, hits + ) + self.detect4( + self.out4(n4), self.aux_out4(p4), targets, image_size, detections, losses, hits + ) + self.detect5( + self.out5(n5), self.aux_out5(p5), targets, image_size, detections, losses, hits + ) + self.detect6( + self.out6(n6), self.aux_out6(c6), targets, image_size, detections, losses, hits + ) return detections, losses, hits class YOLOXHead(nn.Module): - """A module that produces features for YOLO detection layer, decoupling the classification and localization - features. + """A module that produces features for YOLO detection layer, decoupling the classification and + localization features. Args: in_channels: Number of input channels that the module expects. @@ -1353,7 +1510,9 @@ def __init__( super().__init__() def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=norm) + return Conv( + in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=norm + ) def linear(in_channels: int, out_channels: int) -> nn.Module: return nn.Conv2d(in_channels, out_channels, kernel_size=1) @@ -1367,7 +1526,11 @@ def features(num_channels: int) -> nn.Module: def classprob(num_channels: int) -> nn.Module: num_outputs = anchors_per_cell * num_classes outputs = linear(num_channels, num_outputs) - return nn.Sequential(OrderedDict([("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)])) + return nn.Sequential( + OrderedDict( + [("convs", features(num_channels)), (f"outputs_{num_outputs}", outputs)] + ) + ) self.stem = conv(in_channels, hidden_channels) self.feat = features(hidden_channels) @@ -1385,8 +1548,8 @@ def forward(self, x: Tensor) -> Tensor: class YOLOXNetwork(nn.Module): - """The YOLOX network architecture. Different variants (nano/tiny/s/m/l/x) can be achieved by adjusting the - ``depth`` and ``width`` parameters. + """The YOLOX network architecture. Different variants (nano/tiny/s/m/l/x) can be achieved by + adjusting the ``depth`` and ``width`` parameters. Args: num_classes: Number of different classes that this model predicts. 
@@ -1456,10 +1619,24 @@ def spp(in_channels: int, out_channels: int) -> nn.Module: return FastSPP(in_channels, out_channels, activation=activation, norm=normalization) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size, + stride=1, + activation=activation, + norm=normalization, + ) def csp(in_channels: int, out_channels: int) -> nn.Module: return CSPStage( @@ -1552,7 +1729,8 @@ def forward(self, x: Tensor, targets: Optional[TARGETS] = None) -> NETWORK_OUTPU class DarknetNetwork(nn.Module): - """This class can be used to parse the configuration files of the Darknet YOLOv4 implementation. + """This class can be used to parse the configuration files of the Darknet YOLOv4 + implementation. Iterates through the layers from the configuration and creates corresponding PyTorch modules. If ``weights_path`` is given and points to a Darknet model file, loads the convolutional layer weights from the file. @@ -1586,7 +1764,11 @@ class DarknetNetwork(nn.Module): """ def __init__( - self, config_path: str, weights_path: Optional[str] = None, in_channels: Optional[int] = None, **kwargs: Any + self, + config_path: str, + weights_path: Optional[str] = None, + in_channels: Optional[int] = None, + **kwargs: Any, ) -> None: super().__init__() @@ -1672,7 +1854,8 @@ def load_weights(self, weight_file: io.IOBase) -> None: def read(tensor: Tensor) -> int: """Reads the contents of ``tensor`` from the current position of ``weight_file``. - Returns the number of elements read. If there's no more data in ``weight_file``, returns 0. + Returns the number of elements read. If there's no more data in ``weight_file``, + returns 0. """ np_array = np.fromfile(weight_file, count=tensor.numel(), dtype=np.float32) num_elements = np_array.size @@ -1791,7 +1974,9 @@ def convert(key: str, value: str) -> Union[str, int, float, List[Union[str, int, section = {"type": section_match.group(1)} else: if section is None: - raise RuntimeError("Darknet network configuration file does not start with a section header.") + raise RuntimeError( + "Darknet network configuration file does not start with a section header." + ) key, value = line.split("=") key = key.rstrip() value = value.lstrip() @@ -1802,9 +1987,11 @@ def convert(key: str, value: str) -> Union[str, int, float, List[Union[str, int, return sections -def _create_layer(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: - """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch module from the - layer config. +def _create_layer( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: + """Calls one of the ``_create_(config, num_inputs)`` functions to create a PyTorch + module from the layer config. Args: config: Dictionary of configuration options for this layer. 
@@ -1825,7 +2012,9 @@ def _create_layer(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) return create_func[config["type"]](config, num_inputs, **kwargs) -def _create_convolutional(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: +def _create_convolutional( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: """Creates a convolutional layer. Args: @@ -1852,7 +2041,9 @@ def _create_convolutional(config: DARKNET_CONFIG, num_inputs: List[int], **kwarg return layer, config["filters"] -def _create_maxpool(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: +def _create_maxpool( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: """Creates a max pooling layer. Padding is added so that the output resolution will be the input resolution divided by stride, rounded upwards. @@ -1869,7 +2060,9 @@ def _create_maxpool(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any return layer, num_inputs[-1] -def _create_route(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: +def _create_route( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: """Creates a routing layer. A routing layer concatenates the output (or part of it) from the layers specified by the "layers" configuration @@ -1898,7 +2091,9 @@ def _create_route(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) return layer, num_outputs -def _create_shortcut(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: +def _create_shortcut( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: """Creates a shortcut layer. A shortcut layer adds a residual connection from the layer specified by the "from" configuration option. @@ -1915,7 +2110,9 @@ def _create_shortcut(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: An return layer, num_inputs[-1] -def _create_upsample(config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any) -> CREATE_LAYER_OUTPUT: +def _create_upsample( + config: DARKNET_CONFIG, num_inputs: List[int], **kwargs: Any +) -> CREATE_LAYER_OUTPUT: """Creates a layer that upsamples the data. Args: diff --git a/mart/models/yolo.py b/mart/models/yolo.py index 34f4fdbc..63a7c555 100644 --- a/mart/models/yolo.py +++ b/mart/models/yolo.py @@ -3,7 +3,7 @@ from typing import List, Optional, Sequence, Tuple import torch -from torch import nn, Tensor +from torch import Tensor, nn def _get_padding(kernel_size: int, stride: int) -> Tuple[int, nn.Module]: @@ -124,8 +124,9 @@ def forward(self, x: Tensor) -> Tensor: class MaxPool(nn.Module): """A max pooling layer with padding. - The module tries to add padding so much that the output size will be the input size divided by the stride. If the - input size is not divisible by the stride, the output size will be rounded upwards. + The module tries to add padding so much that the output size will be the input size divided by + the stride. If the input size is not divisible by the stride, the output size will be rounded + upwards. 
""" def __init__(self, kernel_size: int, stride: int): @@ -154,7 +155,10 @@ def __init__(self, source_layers: List[int], num_chunks: int, chunk_idx: int) -> self.chunk_idx = chunk_idx def forward(self, outputs: List[Tensor]) -> Tensor: - chunks = [torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] for layer in self.source_layers] + chunks = [ + torch.chunk(outputs[layer], self.num_chunks, dim=1)[self.chunk_idx] + for layer in self.source_layers + ] return torch.cat(chunks, dim=1) @@ -181,7 +185,8 @@ def forward(self, x: Tensor) -> Tensor: class ReOrg(nn.Module): - """Re-organizes the tensor so that every square region of four cells is placed into four different channels. + """Re-organizes the tensor so that every square region of four cells is placed into four + different channels. The result is a tensor with half the width and height, and four times as many channels. """ @@ -223,8 +228,22 @@ def __init__( hidden_channels = out_channels self.convs = nn.Sequential( - Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm), - Conv(hidden_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm), + Conv( + in_channels, + hidden_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=norm, + ), + Conv( + hidden_channels, + out_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=norm, + ), ) self.shortcut = shortcut and in_channels == out_channels @@ -253,9 +272,25 @@ def __init__( super().__init__() hidden_channels = num_channels // 2 - self.conv1 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) - self.conv2 = Conv(hidden_channels, hidden_channels, kernel_size=3, stride=1, activation=activation, norm=norm) - self.mix = Conv(num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.conv1 = Conv( + hidden_channels, + hidden_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=norm, + ) + self.conv2 = Conv( + hidden_channels, + hidden_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=norm, + ) + self.mix = Conv( + num_channels, num_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) def forward(self, x: Tensor) -> Tuple[Tensor, Tensor]: partial = torch.chunk(x, 2, dim=1)[1] @@ -298,14 +333,31 @@ def __init__( # convolutions with N/2 output channels. 
hidden_channels = out_channels // 2 - self.split1 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - self.split2 = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split1 = Conv( + in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) + self.split2 = Conv( + in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) bottlenecks: List[nn.Module] = [ - BottleneckBlock(hidden_channels, hidden_channels, shortcut=shortcut, norm=norm, activation=activation) + BottleneckBlock( + hidden_channels, + hidden_channels, + shortcut=shortcut, + norm=norm, + activation=activation, + ) for _ in range(depth) ] self.bottlenecks = nn.Sequential(*bottlenecks) - self.mix = Conv(hidden_channels * 2, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.mix = Conv( + hidden_channels * 2, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=norm, + ) def forward(self, x: Tensor) -> Tensor: y1 = self.bottlenecks(self.split1(x)) @@ -350,7 +402,14 @@ def __init__( super().__init__() def conv3x3(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=norm) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=norm, + ) def block(in_channels: int, out_channels: int) -> nn.Module: convs = [conv3x3(in_channels, out_channels)] @@ -367,8 +426,12 @@ def block(in_channels: int, out_channels: int) -> nn.Module: if split_channels is None: split_channels = hidden_channels - self.split1 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) - self.split2 = Conv(in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.split1 = Conv( + in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) + self.split2 = Conv( + in_channels, split_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) blocks = [block(split_channels, hidden_channels)] for _ in range(depth - 1): @@ -376,7 +439,9 @@ def block(in_channels: int, out_channels: int) -> nn.Module: self.blocks = nn.ModuleList(blocks) total_channels = (split_channels * 2) + (hidden_channels * depth) - self.mix = Conv(total_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.mix = Conv( + total_channels, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) def forward(self, x: Tensor) -> Tensor: outputs = [self.split1(x), self.split2(x)] @@ -408,7 +473,14 @@ def __init__( super().__init__() def conv(in_channels: int, out_channels: int, kernel_size: int = 1) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=kernel_size, stride=1, activation=activation, norm=norm) + return Conv( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + activation=activation, + norm=norm, + ) self.conv1 = nn.Sequential( conv(in_channels, out_channels), @@ -457,9 +529,18 @@ def __init__( ): super().__init__() hidden_channels = in_channels // 2 - self.conv = Conv(in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.conv = Conv( + in_channels, hidden_channels, kernel_size=1, stride=1, activation=activation, norm=norm + ) self.maxpool = MaxPool(kernel_size=5, stride=1) - self.mix = 
Conv(hidden_channels * 4, out_channels, kernel_size=1, stride=1, activation=activation, norm=norm) + self.mix = Conv( + hidden_channels * 4, + out_channels, + kernel_size=1, + stride=1, + activation=activation, + norm=norm, + ) def forward(self, x: Tensor) -> Tensor: y1 = self.conv(x) @@ -491,13 +572,27 @@ def __init__( super().__init__() def smooth(num_channels: int) -> nn.Module: - return Conv(num_channels, num_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + return Conv( + num_channels, + num_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, + ) def downsample(in_channels: int, out_channels: int) -> nn.Module: conv_module = Conv( - in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) + return nn.Sequential( + OrderedDict([("downsample", conv_module), ("smooth", smooth(out_channels))]) ) - return nn.Sequential(OrderedDict([("downsample", conv_module), ("smooth", smooth(out_channels))])) def maxpool(out_channels: int) -> nn.Module: return nn.Sequential( @@ -516,10 +611,19 @@ def stage(out_channels: int, use_maxpool: bool) -> nn.Module: else: downsample_module = downsample(out_channels // 2, out_channels) stage_module = TinyStage(out_channels, activation=activation, norm=normalization) - return nn.Sequential(OrderedDict([("downsample", downsample_module), ("stage", stage_module)])) + return nn.Sequential( + OrderedDict([("downsample", downsample_module), ("stage", stage_module)]) + ) stages = [ - Conv(in_channels, width, kernel_size=3, stride=2, activation=activation, norm=normalization), + Conv( + in_channels, + width, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ), stage(width * 2, False), stage(width * 4, True), stage(width * 8, True), @@ -564,10 +668,24 @@ def __init__( raise ValueError("Width and depth has to be given for an equal number of stages.") def conv3x3(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, + ) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: csp = CSPStage( @@ -587,7 +705,9 @@ def stage(in_channels: int, out_channels: int, depth: int) -> nn.Module: ) ) - convs = [conv3x3(in_channels, widths[0])] + [conv3x3(widths[0], widths[0]) for _ in range(depths[0] - 1)] + convs = [conv3x3(in_channels, widths[0])] + [ + conv3x3(widths[0], widths[0]) for _ in range(depths[0] - 1) + ] self.stem = nn.Sequential(*convs) self.stages = nn.ModuleList( stage(in_channels, out_channels, depth) @@ -630,7 +750,12 @@ def __init__( def downsample(in_channels: int, out_channels: int, kernel_size: int = 3) -> nn.Module: return Conv( - in_channels, out_channels, kernel_size=kernel_size, stride=2, activation=activation, norm=normalization + in_channels, + out_channels, + kernel_size=kernel_size, + stride=2, + activation=activation, + norm=normalization, ) def stage(in_channels: int, out_channels: 
int, depth: int) -> nn.Module: @@ -670,7 +795,8 @@ def forward(self, x: Tensor) -> List[Tensor]: class YOLOV7Backbone(nn.Module): - """A backbone that corresponds to the W6 variant of the Efficient Layer Aggregation Network from YOLOv7. + """A backbone that corresponds to the W6 variant of the Efficient Layer Aggregation Network + from YOLOv7. Args: in_channels: Number of channels in the input image. @@ -695,10 +821,24 @@ def __init__( super().__init__() def conv3x3(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=1, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=1, + activation=activation, + norm=normalization, + ) def downsample(in_channels: int, out_channels: int) -> nn.Module: - return Conv(in_channels, out_channels, kernel_size=3, stride=2, activation=activation, norm=normalization) + return Conv( + in_channels, + out_channels, + kernel_size=3, + stride=2, + activation=activation, + norm=normalization, + ) def stage(in_channels: int, out_channels: int) -> nn.Module: elan = ELANStage( @@ -720,7 +860,8 @@ def stage(in_channels: int, out_channels: int) -> nn.Module: self.stem = nn.Sequential(*[ReOrg(), conv3x3(in_channels * 4, widths[0])]) self.stages = nn.ModuleList( - stage(in_channels, out_channels) for in_channels, out_channels in zip(widths[:-1], widths[1:]) + stage(in_channels, out_channels) + for in_channels, out_channels in zip(widths[:-1], widths[1:]) ) def forward(self, x: Tensor) -> List[Tensor]: diff --git a/mart/nn/nn.py b/mart/nn/nn.py index 9b3310ec..51611837 100644 --- a/mart/nn/nn.py +++ b/mart/nn/nn.py @@ -13,7 +13,15 @@ import torch -__all__ = ["GroupNorm32", "SequentialDict", "ReturnKwargs", "CallWith", "Sum", "TotalVariation", "EmptyTargets"] +__all__ = [ + "GroupNorm32", + "SequentialDict", + "ReturnKwargs", + "CallWith", + "Sum", + "TotalVariation", + "EmptyTargets", +] logger = logging.getLogger(__name__) @@ -178,7 +186,12 @@ def __call__( # Extend args with selected kwargs using arg_keys try: - args.extend([kwargs[kwargs_key] if isinstance(kwargs_key, str) else kwargs_key for kwargs_key in arg_keys]) + args.extend( + [ + kwargs[kwargs_key] if isinstance(kwargs_key, str) else kwargs_key + for kwargs_key in arg_keys + ] + ) except KeyError as ex: raise Exception( f"{module_name} only received kwargs: {', '.join(kwargs.keys())}." @@ -186,7 +199,10 @@ def __call__( # Replace kwargs with selected kwargs using kwarg_keys try: - kwargs = {name: kwargs[kwargs_key] if isinstance(kwargs_key, str) else kwargs_key for name, kwargs_key in kwarg_keys.items()} + kwargs = { + name: kwargs[kwargs_key] if isinstance(kwargs_key, str) else kwargs_key + for name, kwargs_key in kwarg_keys.items() + } except KeyError as ex: raise Exception( f"{module_name} only received kwargs: {', '.join(kwargs.keys())}." 
@@ -312,9 +328,7 @@ def forward(self, *values, weights=None): def _total_variation(self, image): return torch.mean( torch.sum(torch.square(image[:, 1:, :] - image[:, :-1, :])) - + torch.sum( # noqa: W503 - torch.square(image[:, :, 1:] - image[:, :, :-1]) - ) + + torch.sum(torch.square(image[:, :, 1:] - image[:, :, :-1])) # noqa: W503 ) @@ -324,5 +338,6 @@ def forward(self, targets): { "boxes": torch.empty((0, 4), device=t["boxes"].device), "labels": torch.empty(0, dtype=torch.int64, device=t["labels"].device), - } for t in targets + } + for t in targets ] From 02d30becea4f7b4f1a8adf9a5601150ce4f3f8d1 Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 26 Jun 2023 09:33:26 -0700 Subject: [PATCH 21/23] _train_mode_ can intefer with eval mode callback --- mart/configs/model/yolo.yaml | 3 --- mart/models/detection/yolo.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/mart/configs/model/yolo.yaml b/mart/configs/model/yolo.yaml index 83818ed7..8b1454b7 100644 --- a/mart/configs/model/yolo.yaml +++ b/mart/configs/model/yolo.yaml @@ -23,8 +23,6 @@ training_sequence: seq010: yolo: images: "input" - _train_mode_: False - _inference_mode_: True seq020: yolo.process_detections: @@ -52,7 +50,6 @@ validation_sequence: _name_: "losses" images: "input" targets: "target.list_of_targets" - _train_mode_: False seq100: ${..training_sequence.seq100} test_sequence: diff --git a/mart/models/detection/yolo.py b/mart/models/detection/yolo.py index ba580616..39ffd78a 100644 --- a/mart/models/detection/yolo.py +++ b/mart/models/detection/yolo.py @@ -133,7 +133,7 @@ def forward( detection layer is the feature map size (width * height) times the number of anchors per cell (usually 3 or 4). The predicted box coordinates are in `(x1, y1, x2, y2)` format and scaled to the input image size. 
""" - self.validate_batch(images, targets) + #self.validate_batch(images, targets) images_tensor = images if isinstance(images, Tensor) else torch.stack(images) detections, losses, hits = self.network(images_tensor, targets) From 0e4cfc3b7e8d0ffcccee20c19247c00d50b5dc5c Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 26 Jun 2023 13:49:40 -0700 Subject: [PATCH 22/23] Make YOLO model return detections and losses --- .../experiment/COCO_YOLOv3_ShapeShifter.yaml | 12 +------ mart/configs/model/yolo.yaml | 32 +++++++------------ mart/models/detection/yolo.py | 6 ++-- 3 files changed, 14 insertions(+), 36 deletions(-) diff --git a/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml index ad294e42..d14f7f58 100644 --- a/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml +++ b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml @@ -117,9 +117,6 @@ model: seq005: "perturbation" seq006: "input_adv" seq010: - yolo: - images: "input_adv" - seq030: yolo: images: "input_adv" targets: "empty_targets" @@ -130,7 +127,7 @@ model: seq100: loss: _call_with_args_: - - "losses.confidence" + - "yolo.confidence" - "total_variation" weights: - 1 @@ -144,15 +141,8 @@ model: seq004: ${..training_sequence.seq004} seq005: ${..training_sequence.seq005} seq006: ${..training_sequence.seq006} - seq010: ${..training_sequence.seq010} - seq030: - yolo: - images: "input_adv" - targets: "empty_targets" seq050: ${..training_sequence.seq050} - seq100: ${..training_sequence.seq100} test_sequence: seq005: ${..training_sequence.seq005} seq006: ${..training_sequence.seq006} - seq010: ${..training_sequence.seq010} diff --git a/mart/configs/model/yolo.yaml b/mart/configs/model/yolo.yaml index 8b1454b7..cdd31163 100644 --- a/mart/configs/model/yolo.yaml +++ b/mart/configs/model/yolo.yaml @@ -23,33 +23,23 @@ training_sequence: seq010: yolo: images: "input" + targets: "target.list_of_targets" seq020: yolo.process_detections: _name_: "preds" - preds: "yolo" - - seq030: - yolo: - _name_: "losses" - images: "input" - targets: "target.list_of_targets" + preds: "yolo.detections" seq100: loss: _call_with_args_: - - "losses.overlap" - - "losses.confidence" - - "losses.classification" + - "yolo.overlap" + - "yolo.confidence" + - "yolo.classification" validation_sequence: seq010: ${..training_sequence.seq010} seq020: ${..training_sequence.seq020} - seq030: - yolo: - _name_: "losses" - images: "input" - targets: "target.list_of_targets" seq100: ${..training_sequence.seq100} test_sequence: @@ -58,14 +48,14 @@ test_sequence: training_step_log: loss: "loss" - loss_overlap: "losses.overlap" - loss_confidence: "losses.confidence" - loss_classification: "losses.classification" + loss_overlap: "yolo.overlap" + loss_confidence: "yolo.confidence" + loss_classification: "yolo.classification" validation_step_log: loss: "loss" - loss_overlap: "losses.overlap" - loss_confidence: "losses.confidence" - loss_classification: "losses.classification" + loss_overlap: "yolo.overlap" + loss_confidence: "yolo.confidence" + loss_classification: "yolo.classification" test_step_log: null diff --git a/mart/models/detection/yolo.py b/mart/models/detection/yolo.py index 39ffd78a..dc9e6c32 100644 --- a/mart/models/detection/yolo.py +++ b/mart/models/detection/yolo.py @@ -137,12 +137,10 @@ def forward( images_tensor = images if isinstance(images, Tensor) else torch.stack(images) detections, losses, hits = self.network(images_tensor, targets) - if targets is None: - detections = torch.cat(detections, 1) - 
return detections + detections = torch.cat(detections, 1) losses = torch.stack(losses).sum(0) - return {"overlap": losses[0], "confidence": losses[1], "classification": losses[2]} + return {"detections": detections, "overlap": losses[0], "confidence": losses[1], "classification": losses[2]} def infer(self, image: Tensor) -> PRED: """Feeds an image to the network and returns the detected bounding boxes, confidence From eacd35538546be883f8c539406443a12f272e78d Mon Sep 17 00:00:00 2001 From: Cory Cornelius Date: Mon, 26 Jun 2023 14:11:48 -0700 Subject: [PATCH 23/23] bugfix --- mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml index d14f7f58..0b09ab99 100644 --- a/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml +++ b/mart/configs/experiment/COCO_YOLOv3_ShapeShifter.yaml @@ -144,5 +144,6 @@ model: seq050: ${..training_sequence.seq050} test_sequence: + seq004: ${..training_sequence.seq004} seq005: ${..training_sequence.seq005} seq006: ${..training_sequence.seq006}
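
Illustrative note on the interface introduced in PATCH 22: the YOLO module's forward() now returns the detections together with the summed loss components in a single dict, which is why yolo.yaml can drop the separate "losses" sequence step and point training_step_log / validation_step_log entries at "yolo.overlap", "yolo.confidence", and "yolo.classification". The Python sketch below only mimics the shape of that contract; ToyYOLO, the random loss values, and the step_log mapping are hypothetical stand-ins, not MART code or the actual YOLO network.

# Minimal sketch, assuming a forward() that returns detections plus loss
# components in one dict, and a name -> output-key mapping in the style of
# the *_step_log dicts used by the modular model configs.
import torch
from torch import Tensor, nn


class ToyYOLO(nn.Module):  # hypothetical stand-in for the real YOLO module
    def forward(self, images: Tensor, targets=None) -> dict:
        # Fake detections: one (x1, y1, x2, y2, confidence, class) row per image.
        detections = torch.zeros(images.shape[0], 1, 6)
        # Fake summed loss components: (overlap, confidence, classification).
        losses = torch.rand(3)
        return {
            "detections": detections,
            "overlap": losses[0],
            "confidence": losses[1],
            "classification": losses[2],
        }


# Logging name -> output key, mirroring the dict-style training_step_log.
step_log = {
    "loss_overlap": "overlap",
    "loss_confidence": "confidence",
    "loss_classification": "classification",
}

output = ToyYOLO()(torch.rand(2, 3, 64, 64))
for log_name, output_key in step_log.items():
    print(f"training/{log_name}", float(output[output_key]))

Because the single call produces both "detections" (consumed by process_detections in seq020) and the loss components (consumed by the loss step in seq100), one sequence step can feed both paths, which is the design choice that lets the ShapeShifter experiment config reference "yolo.confidence" directly.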