diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e8ea998 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +*.pyc +checkpoints +data +logs +.DS_Store +.ipynb_checkpoints +plots_paper_figures.py +plot_results.ipynb +plots_paper_figures_iccv.py +plot/ +slurm/ +results/ +ProcessWebVision.ipynb +results +configs/webvision_full +configs/webvision_imagenet +configs/clothing1m +script/WebVisionFull.slurm +script/WebVisionFull_ImageNet.slurm +script/Clothing1M.slurm diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6b7ecd1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 HanxunHuangLemonBear + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 3160e8f..ef3eeaf 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,68 @@ -### Code for ICML 2018 paper "Dimensionality-Driven Learning with Noisy Labels". +# Normalized Loss Functions - Active Passive Losses +Code for ICML2020 Paper ["Normalized Loss Functions for Deep Learning with Noisy Labels"](https://arxiv.org/abs/2006.13554) -#### - Update (2018.07): Issues fixed on CIFAR-10. -#### - Update (2019.10): Start training with symmetric cross entropy (SCE) loss (replacing cross entropy). +## Requirements +```console +Python >= 3.6, PyTorch >= 1.3.1, torchvision >= 0.4.1, mlconfig +``` -The Symmetric Cross Entropy (SCE) was demonstrated can improve several exisiting methods including the D2L: -ICCV2019 "Symmetric Cross Entropy for Robust Learning with Noisy Labels" -https://arxiv.org/abs/1908.06112 -https://github.com/YisenWang/symmetric_cross_entropy_for_noisy_labels +## How To Run +##### Configs for the experiment settings +Check '*.yaml' file in the config folder for each experiment. -#### - Update (2020.03): convergence issue on CIFAR-100 when using SCE loss: learning rate, data augmentation and parameters for SCE. 
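A minimal environment setup sketch based on the requirement list above (the exact pip command and version pins are an assumption, not taken from the repository):
```console
$ pip install "torch>=1.3.1" "torchvision>=0.4.1" mlconfig
```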
+##### Arguments +* noise_rate: noise rate +* asym: use if it is asymmetric noise, default is symmetric +* config_path: path to the configs folder +* version: the config file name +* exp_name: name of the experiments (as note) +* seed: random seed +Example for 0.4 symmetric noise rate with NCE+RCE loss +```console +# CIFAR-10 +$ python3 main.py --exp_name test_exp \ + --noise_rate 0.4 \ + --version nce+rce \ + --config_path configs/cifar10/sym \ + --seed 123 + -### 1. Train DNN models using command line: +# CIFAR-100 +$ python3 main.py --exp_name test_exp \ + --noise_rate 0.4 \ + --version nce+rce \ + --config_path configs/cifar100/sym \ + --seed 123 +``` +Example for plotting lid_trend_through_training with 0.4 symmetric noise rate and D2L learning +```console +# CIFAR-10 +$ python3 main.py --exp_name test_exp \ + --noise_rate 0.4 \ + --version d2l \ + --config_path configs/cifar10/sym \ + --seed 123 \ + --plot +``` -An example:
+Example for plotting the LID, accuracy, and CSR trends of different learning models throughout training, with 0.4 symmetric noise rate +```console +# CIFAR-10 +$ python3 main.py --exp_name test_exp \ + --noise_rate 0.4 \ + --config_path configs/cifar10/sym \ + --seed 123 \ + --plotall +``` -``` -python train_model.py -d mnist -m d2l -e 50 -b 128 -r 40 -``` +## Citing this work +If you use this code in your work, please cite the accompanying paper: -`-d`: dataset in ['mnist', 'svhn', 'cifar-10', 'cifar-100']
-`-m`: model in ['ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'd2l']
-`-e`: epoch, `-b`: batch size, `-r`: noise rate in [0, 100]
- - -### 2. Run with pre-set parameters in main function of train_model.py: -```python - # mnist example - args = parser.parse_args(['-d', 'mnist', '-m', 'd2l', - '-e', '50', '-b', '128', - '-r', '40']) - main(args) - - # svhn example - args = parser.parse_args(['-d', 'svhn', '-m', 'd2l', - '-e', '50', '-b', '128', - '-r', '40']) - main(args) - - # cifar-10 example - args = parser.parse_args(['-d', 'cifar-10', '-m', 'd2l', - '-e', '120', '-b', '128', - '-r', '40']) - main(args) - - # cifar-100 example - args = parser.parse_args(['-d', 'cifar-100', '-m', 'd2l', - '-e', '200', '-b', '128', - '-r', '40']) - main(args) +``` +@inproceedings{ma2020normalized, + title={Normalized Loss Functions for Deep Learning with Noisy Labels}, + author={Ma, Xingjun and Huang, Hanxun and Wang, Yisen and Romano, Simone and Erfani, Sarah and Bailey, James}, + booktitle={ICML}, + year={2020} +} ``` -#### Requirements: -tensorflow, Keras, numpy, scipy, sklearn, matplotlib diff --git a/archive/dataset.py b/archive/dataset.py new file mode 100644 index 0000000..adbf5b0 --- /dev/null +++ b/archive/dataset.py @@ -0,0 +1,586 @@ +from torchvision import datasets, transforms +from torch.utils.data import DataLoader +from PIL import Image +from tqdm import tqdm +from numpy.testing import assert_array_almost_equal +import numpy as np +import os +import torch +import random +import collections + + +def build_for_cifar100(size, noise): + """ random flip between two random classes. + """ + assert(noise >= 0.) and (noise <= 1.) + + P = (1. - noise) * np.eye(size) + for i in np.arange(size - 1): + P[i, i+1] = noise + + # adjust last row + P[size-1, 0] = noise + + assert_array_almost_equal(P.sum(axis=1), 1, 1) + return P + + +def multiclass_noisify(y, P, random_state=0): + """ Flip classes according to transition probability matrix T. + It expects a number between 0 and the number of classes - 1. + """ + + assert P.shape[0] == P.shape[1] + assert np.max(y) < P.shape[0] + + # row stochastic matrix + assert_array_almost_equal(P.sum(axis=1), np.ones(P.shape[1])) + assert (P >= 0.0).all() + + m = y.shape[0] + new_y = y.copy() + flipper = np.random.RandomState(random_state) + + for idx in np.arange(m): + i = y[idx] + # draw a vector with only an 1 + flipped = flipper.multinomial(1, P[i, :], 1)[0] + new_y[idx] = np.where(flipped == 1)[0] + + return new_y + + +def other_class(n_classes, current_class): + """ + Returns a list of class indices excluding the class indexed by class_ind + :param nb_classes: number of classes in the task + :param class_ind: the class index to be omitted + :return: one random class that != class_ind + """ + if current_class < 0 or current_class >= n_classes: + error_str = "class_ind must be within the range (0, nb_classes - 1)" + raise ValueError(error_str) + + other_class_list = list(range(n_classes)) + other_class_list.remove(current_class) + other_class = np.random.choice(other_class_list) + return other_class + + +class MNISTNoisy(datasets.MNIST): + def __init__(self, root, train=True, transform=None, target_transform=None, download=True, nosiy_rate=0.0, asym=False, seed=0): + super(MNISTNoisy, self).__init__(root, transform=transform, target_transform=target_transform, download=download) + self.targets = self.targets.numpy() + if asym: + P = np.eye(10) + n = nosiy_rate + + P[7, 7], P[7, 1] = 1. - n, n + # 2 -> 7 + P[2, 2], P[2, 7] = 1. - n, n + + # 5 <-> 6 + P[5, 5], P[5, 6] = 1. - n, n + P[6, 6], P[6, 5] = 1. - n, n + + # 3 -> 8 + P[3, 3], P[3, 8] = 1. 
- n, n + + y_train_noisy = multiclass_noisify(self.targets, P=P, random_state=seed) + actual_noise = (y_train_noisy != self.targets).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + self.targets = y_train_noisy + + else: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(10)] + class_noisy = int(n_noisy / 10) + noisy_idx = [] + for d in range(10): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=10, current_class=self.targets[i]) + print(len(noisy_idx)) + + print("Print noisy label generation statistics:") + for i in range(10): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." % (i, n_noisy)) + + return + + +class cifar10Nosiy(datasets.CIFAR10): + def __init__(self, root, train=True, transform=None, target_transform=None, download=True, nosiy_rate=0.0, asym=False): + super(cifar10Nosiy, self).__init__(root, transform=transform, target_transform=target_transform) + if asym: + # automobile < - truck, bird -> airplane, cat <-> dog, deer -> horse + source_class = [9, 2, 3, 5, 4] + target_class = [1, 0, 5, 3, 7] + for s, t in zip(source_class, target_class): + cls_idx = np.where(np.array(self.targets) == s)[0] + n_noisy = int(nosiy_rate * cls_idx.shape[0]) + noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False) + for idx in noisy_sample_index: + self.targets[idx] = t + return + elif nosiy_rate > 0: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(10)] + class_noisy = int(n_noisy / 10) + noisy_idx = [] + for d in range(10): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=10, current_class=self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(10): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." % (i, n_noisy)) + return + + +class cifar100Nosiy(datasets.CIFAR100): + def __init__(self, root, train=True, transform=None, target_transform=None, download=False, nosiy_rate=0.0, asym=False, seed=0): + super(cifar100Nosiy, self).__init__(root, download=download, transform=transform, target_transform=target_transform) + if asym: + """mistakes are inside the same superclass of 10 classes, e.g. 
'fish' + """ + nb_classes = 100 + P = np.eye(nb_classes) + n = nosiy_rate + nb_superclasses = 20 + nb_subclasses = 5 + + if n > 0.0: + for i in np.arange(nb_superclasses): + init, end = i * nb_subclasses, (i+1) * nb_subclasses + P[init:end, init:end] = build_for_cifar100(nb_subclasses, n) + + y_train_noisy = multiclass_noisify(np.array(self.targets), P=P, random_state=seed) + actual_noise = (y_train_noisy != np.array(self.targets)).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + self.targets = y_train_noisy.tolist() + return + elif nosiy_rate > 0: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(100)] + class_noisy = int(n_noisy / 100) + noisy_idx = [] + for d in range(100): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=100, current_class=self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(100): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." % (i, n_noisy)) + return + + +class DatasetGenerator(): + def __init__(self, + batchSize=128, + eval_batch_size=256, + dataPath='data/', + seed=123, + numOfWorkers=4, + asym=False, + dataset_type='cifar10', + is_cifar100=False, + cutout_length=16, + noise_rate=0.4): + self.seed = seed + np.random.seed(seed) + self.batchSize = batchSize + self.eval_batch_size = eval_batch_size + self.dataPath = dataPath + self.numOfWorkers = numOfWorkers + self.cutout_length = cutout_length + self.noise_rate = noise_rate + self.dataset_type = dataset_type + self.asym = asym + self.data_loaders = self.loadData() + return + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + if self.dataset_type == 'mnist': + MEAN = [0.1307] + STD = [0.3081] + train_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD)]) + + train_dataset = MNISTNoisy(root=self.dataPath, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + seed=self.seed, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.MNIST(root=self.dataPath, + train=False, + transform=test_transform, + download=True) + + elif self.dataset_type == 'cifar100': + CIFAR_MEAN = [0.5071, 0.4865, 0.4409] + CIFAR_STD = [0.2673, 0.2564, 0.2762] + + train_transform = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.RandomRotation(20), + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + train_dataset = cifar100Nosiy(root=self.dataPath, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + seed=self.seed, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.CIFAR100(root=self.dataPath, + train=False, + transform=test_transform, + download=True) + + elif self.dataset_type == 'cifar10': + CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124] + CIFAR_STD = [0.24703233, 0.24348505, 0.26158768] + + train_transform = transforms.Compose([ + 
transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + train_dataset = cifar10Nosiy(root=self.dataPath, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.CIFAR10(root=self.dataPath, + train=False, + transform=test_transform, + download=True) + else: + raise("Unknown Dataset") + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.batchSize, + shuffle=True, + pin_memory=True, + num_workers=self.numOfWorkers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.numOfWorkers) + + print("Num of train %d" % (len(train_dataset))) + print("Num of test %d" % (len(test_dataset))) + + return data_loaders + + +class Clothing1MDataset: + def __init__(self, path, type='train', transform=None, target_transform=None): + self.path = path + if type == 'test': + flist = os.path.join(path, "annotations/clean_test.txt") + elif type == 'valid': + flist = os.path.join(path, "annotations/clean_val.txt") + elif type == 'train': + flist = os.path.join(path, "annotations/noisy_train.txt") + else: + raise('Unknown type') + + self.imlist = self.flist_reader(flist) + self.transform = transform + + def __len__(self): + return len(self.imlist) + + def __getitem__(self, index): + impath, target = self.imlist[index] + img = Image.open(impath).convert("RGB") + if self.transform is not None: + img = self.transform(img) + return img, target + + def flist_reader(self, flist): + imlist = [] + with open(flist, 'r') as rf: + for line in rf.readlines(): + row = line.split(" ") + impath = self.path + row[0] + imlabel = row[1] + imlist.append((impath, int(imlabel))) + return imlist + + +class Clothing1MDatasetLoader: + def __init__(self, batchSize=128, eval_batch_size=256, dataPath='data/', numOfWorkers=4): + self.batchSize = batchSize + self.eval_batch_size = eval_batch_size + self.dataPath = dataPath + self.numOfWorkers = numOfWorkers + self.data_loaders = self.loadData() + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + MEAN = [0.6959, 0.6537, 0.6371] + STD = [0.3113, 0.3192, 0.3214] + train_transform = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=MEAN, std=STD), + ]) + test_transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + transforms.Normalize(mean=MEAN, std=STD) + ]) + + train_dataset = Clothing1MDataset(path=self.dataPath, + type='train', + transform=train_transform) + + test_dataset = Clothing1MDataset(path=self.dataPath, + type='test', + transform=test_transform) + + valid_dataset = Clothing1MDataset(path=self.dataPath, + type='valid', + transform=test_transform) + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.batchSize, + shuffle=True, + pin_memory=True, + num_workers=self.numOfWorkers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.numOfWorkers) + + data_loaders['valid_dataset'] = DataLoader(dataset=valid_dataset, + 
batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.numOfWorkers) + return data_loaders + + +class NosieImageNet(datasets.ImageNet): + def __init__(self, root, split='train', seed=999, download=None, target_class_num=200, nosiy_rate=0.4, **kwargs): + super(NosieImageNet, self).__init__(root, download=download, split=split, **kwargs) + random.seed(seed) + np.random.seed(seed) + self.new_idx = random.sample(list(range(0, 1000)), k=target_class_num) + print(len(self.new_idx), len(self.imgs)) + self.new_imgs = [] + self.new_targets = [] + + for file, cls_id in self.imgs: + if cls_id in self.new_idx: + new_idx = self.new_idx.index(cls_id) + self.new_imgs.append((file, new_idx)) + self.new_targets.append(new_idx) + self.imgs = self.new_imgs + self.targets = self.new_targets + print(min(self.targets), max(self.targets)) + # Noise + if split == 'train': + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(target_class_num)] + class_noisy = int(n_noisy / target_class_num) + noisy_idx = [] + for d in range(target_class_num): + print(len(class_index[d]), d) + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=target_class_num, current_class=self.targets[i]) + (file, old_idx) = self.imgs[i] + self.imgs[i] = (file, self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(target_class_num): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." 
% (i, n_noisy)) + + self.samples = self.imgs + + +class ImageNetDatasetLoader: + def __init__(self, + batchSize=128, + eval_batch_size=256, + dataPath='data/', + seed=999, + target_class_num=200, + nosiy_rate=0.4, + numOfWorkers=4): + self.batchSize = batchSize + self.eval_batch_size = eval_batch_size + self.dataPath = dataPath + self.numOfWorkers = numOfWorkers + self.seed = seed + self.target_class_num = target_class_num + self.nosiy_rate = nosiy_rate + self.data_loaders = self.loadData() + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + + train_transform = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.2), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + test_transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + train_dataset = NosieImageNet(root=self.dataPath, + split='train', + nosiy_rate=self.nosiy_rate, + target_class_num=self.target_class_num, + seed=self.seed, + transform=train_transform, + download=True) + + test_dataset = NosieImageNet(root=self.dataPath, + split='val', + nosiy_rate=self.nosiy_rate, + target_class_num=self.target_class_num, + seed=self.seed, + transform=test_transform, + download=True) + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.batchSize, + shuffle=True, + pin_memory=True, + num_workers=self.numOfWorkers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.batchSize, + shuffle=False, + pin_memory=True, + num_workers=self.numOfWorkers) + return data_loaders + + + + + + +def online_mean_and_sd(loader): + """Compute the mean and sd in an online fashion + + Var[x] = E[X^2] - E^2[X] + """ + cnt = 0 + fst_moment = torch.empty(3) + snd_moment = torch.empty(3) + + for data, _ in tqdm(loader): + + b, c, h, w = data.shape + nb_pixels = b * h * w + sum_ = torch.sum(data, dim=[0, 2, 3]) + sum_of_square = torch.sum(data ** 2, dim=[0, 2, 3]) + fst_moment = (cnt * fst_moment + sum_) / (cnt + nb_pixels) + snd_moment = (cnt * snd_moment + sum_of_square) / (cnt + nb_pixels) + + cnt += nb_pixels + + return fst_moment, torch.sqrt(snd_moment - fst_moment ** 2) + + +if __name__ == '__main__': + # train_transform = transforms.Compose([ + # transforms.Resize((224, 224)), + # transforms.ToTensor(), + # ]) + # test = Clothing1MDataset(path='../datasets/clothing1M', transform=train_transform) + # loader = DataLoader(test, + # batch_size=128, + # num_workers=12, + # shuffle=True) + # mean, std = online_mean_and_sd(loader) + # print(mean) + # print(std) + # + # ''' + # tensor([0.7215, 0.6846, 0.6679]) + # tensor([0.3021, 0.3122, 0.3167]) + # ''' + train = NosieImageNet(root='../datasets/ILSVR2012', split='train') + valid = NosieImageNet(root='../datasets/ILSVR2012', split='val') diff --git a/archive/loss.py b/archive/loss.py new file mode 100644 index 0000000..3a83488 --- /dev/null +++ b/archive/loss.py @@ -0,0 +1,495 @@ +import torch +import torch.nn.functional as F +import numpy as np + +if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + if torch.cuda.device_count() > 1: + device = torch.device('cuda:0') + else: + device = torch.device('cuda') +else: + device = torch.device('cpu') + 
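# Illustrative usage sketch for the losses defined below (the alpha/beta values here are
# assumptions for the example, not defaults taken from this file):
#   criterion = SCELoss(alpha=0.1, beta=1.0, num_classes=10)
#   loss = criterion(model(images), labels)   # model(images) are raw logits, shape (batch, num_classes)
#   loss.backward()
# SCELoss combines standard cross entropy with a reverse cross entropy (RCE) term:
#   loss = alpha * CE + beta * mean(RCE), where RCE = -sum_k pred_k * log(onehot_k)
# and the one-hot labels are clamped to 1e-4 so that log(0) never occurs.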
+ +class SCELoss(torch.nn.Module): + def __init__(self, alpha, beta, num_classes=10): + super(SCELoss, self).__init__() + self.device = device + self.alpha = alpha + self.beta = beta + self.num_classes = num_classes + self.cross_entropy = torch.nn.CrossEntropyLoss() + + def forward(self, pred, labels): + # CCE + ce = self.cross_entropy(pred, labels) + + # RCE + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + + # Loss + loss = self.alpha * ce + self.beta * rce.mean() + return loss + + +class ReverseCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(ReverseCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + return self.scale * rce.mean() + normalizor = 1 / 4 * (self.num_classes - 1) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + return self.scale * normalizor * rce.mean() + + +class NormalizedReverseCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedReverseCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + normalizor = 1 / 4 * (self.num_classes - 1) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + return self.scale * normalizor * rce.mean() + + +class NormalizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.log_softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + nce = -1 * torch.sum(label_one_hot * pred, dim=1) / (- pred.sum(dim=1)) + return self.scale * nce.mean() + + +class GeneralizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, q=0.7): + super(GeneralizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.q = q + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + gce = (1. 
- torch.pow(torch.sum(label_one_hot * pred, dim=1), self.q)) / self.q + return gce.mean() + + +class NormalizedGeneralizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0, q=0.7): + super(NormalizedGeneralizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.q = q + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + numerators = 1. - torch.pow(torch.sum(label_one_hot * pred, dim=1), self.q) + denominators = self.num_classes - pred.pow(self.q).sum(dim=1) + ngce = numerators / denominators + return self.scale * ngce.mean() + + +class MeanAbsoluteError(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(MeanAbsoluteError, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + return + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + mae = 1. - torch.sum(label_one_hot * pred, dim=1) + return self.scale * mae.mean() + + +class NormalizedMeanAbsoluteError(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedMeanAbsoluteError, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + return + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + normalizor = 1 / (2 * (self.num_classes - 1)) + mae = 1. - torch.sum(label_one_hot * pred, dim=1) + return self.scale * normalizor * mae.mean() + + +class NCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes): + super(NCEandRCE, self).__init__() + self.num_classes = num_classes + self.nce = NormalizedCrossEntropy(scale=alpha, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nce(pred, labels) + self.rce(pred, labels) + + +class NCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes): + super(NCEandMAE, self).__init__() + self.num_classes = num_classes + self.nce = NormalizedCrossEntropy(scale=alpha, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nce(pred, labels) + self.mae(pred, labels) + + +class GCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandMAE, self).__init__() + self.num_classes = num_classes + self.gce = GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.mae(pred, labels) + + +class GCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandRCE, self).__init__() + self.num_classes = num_classes + self.gce = GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.rce(pred, labels) + + +class GCEandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandNCE, self).__init__() + self.num_classes = num_classes + self.gce = 
GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.nce = NormalizedCrossEntropy(num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.nce(pred, labels) + + +class NGCEandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandNCE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.nce = NormalizedCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.nce(pred, labels) + + +class NGCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandMAE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.mae(pred, labels) + + +class NGCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandRCE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.rce(pred, labels) + + +class MAEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes): + super(MAEandRCE, self).__init__() + self.num_classes = num_classes + self.mae = MeanAbsoluteError(scale=alpha, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.mae(pred, labels) + self.rce(pred, labels) + + +class NLNL(torch.nn.Module): + def __init__(self, train_loader, num_classes, ln_neg=1): + super(NLNL, self).__init__() + self.device = device + self.num_classes = num_classes + self.ln_neg = ln_neg + weight = torch.FloatTensor(num_classes).zero_() + 1. + if not hasattr(train_loader.dataset, 'targets'): + weight = [1] * num_classes + weight = torch.FloatTensor(weight) + else: + for i in range(num_classes): + weight[i] = (torch.from_numpy(np.array(train_loader.dataset.targets)) == i).sum() + weight = 1 / (weight / weight.max()) + self.weight = weight.to(self.device) + self.criterion = torch.nn.CrossEntropyLoss(weight=self.weight) + self.criterion_nll = torch.nn.NLLLoss() + + def forward(self, pred, labels): + labels_neg = (labels.unsqueeze(-1).repeat(1, self.ln_neg) + + torch.LongTensor(len(labels), self.ln_neg).to(self.device).random_(1, self.num_classes)) % self.num_classes + labels_neg = torch.autograd.Variable(labels_neg) + + assert labels_neg.max() <= self.num_classes-1 + assert labels_neg.min() >= 0 + assert (labels_neg != labels.unsqueeze(-1).repeat(1, self.ln_neg)).sum() == len(labels)*self.ln_neg + + s_neg = torch.log(torch.clamp(1. 
- F.softmax(pred, 1), min=1e-5, max=1.)) + s_neg *= self.weight[labels].unsqueeze(-1).expand(s_neg.size()).to(self.device) + labels = labels * 0 - 100 + loss = self.criterion(pred, labels) * float((labels >= 0).sum()) + loss_neg = self.criterion_nll(s_neg.repeat(self.ln_neg, 1), labels_neg.t().contiguous().view(-1)) * float((labels_neg >= 0).sum()) + loss = ((loss+loss_neg) / (float((labels >= 0).sum())+float((labels_neg[:, 0] >= 0).sum()))) + return loss + + +class FocalLoss(torch.nn.Module): + ''' + https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py + ''' + + def __init__(self, gamma=0, alpha=None, size_average=True): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + if isinstance(alpha, (float, int)): + self.alpha = torch.Tensor([alpha, 1-alpha]) + if isinstance(alpha, list): + self.alpha = torch.Tensor(alpha) + self.size_average = size_average + + def forward(self, input, target): + if input.dim() > 2: + input = input.view(input.size(0), input.size(1), -1) # N,C,H,W => N,C,H*W + input = input.transpose(1, 2) # N,C,H*W => N,H*W,C + input = input.contiguous().view(-1, input.size(2)) # N,H*W,C => N*H*W,C + target = target.view(-1, 1) + + logpt = F.log_softmax(input, dim=1) + logpt = logpt.gather(1, target) + logpt = logpt.view(-1) + pt = torch.autograd.Variable(logpt.data.exp()) + + if self.alpha is not None: + if self.alpha.type() != input.data.type(): + self.alpha = self.alpha.type_as(input.data) + at = self.alpha.gather(0, target.data.view(-1)) + logpt = logpt * torch.autograd.Variable(at) + + loss = -1 * (1-pt)**self.gamma * logpt + if self.size_average: + return loss.mean() + else: + return loss.sum() + + +class NormalizedFocalLoss(torch.nn.Module): + def __init__(self, scale=1.0, gamma=0, num_classes=10, alpha=None, size_average=True): + super(NormalizedFocalLoss, self).__init__() + self.gamma = gamma + self.size_average = size_average + self.num_classes = num_classes + self.scale = scale + + def forward(self, input, target): + target = target.view(-1, 1) + logpt = F.log_softmax(input, dim=1) + normalizor = torch.sum(-1 * (1 - logpt.data.exp()) ** self.gamma * logpt, dim=1) + logpt = logpt.gather(1, target) + logpt = logpt.view(-1) + pt = torch.autograd.Variable(logpt.data.exp()) + loss = -1 * (1-pt)**self.gamma * logpt + loss = self.scale * loss / normalizor + + if self.size_average: + return loss.mean() + else: + return loss.sum() + + +class NFLandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandNCE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.nce = NormalizedCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.nce(pred, labels) + + +class NFLandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandMAE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.mae(pred, labels) + + +class NFLandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandRCE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.rce = 
ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.rce(pred, labels) + + +class DMILoss(torch.nn.Module): + def __init__(self, num_classes): + super(DMILoss, self).__init__() + self.num_classes = num_classes + + def forward(self, output, target): + outputs = F.softmax(output, dim=1) + targets = target.reshape(target.size(0), 1).cpu() + y_onehot = torch.FloatTensor(target.size(0), self.num_classes).zero_() + y_onehot.scatter_(1, targets, 1) + y_onehot = y_onehot.transpose(0, 1).cuda() + mat = y_onehot @ outputs + return -1.0 * torch.log(torch.abs(torch.det(mat.float())) + 0.001) + + +class BootSoftLoss(torch.nn.Module): + def __init__(self, num_classes, beta=0.95): + super(BootSoftLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.beta = beta + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + bsl = -torch.sum((self.beta * label_one_hot + (1. - self.beta) * pred) * torch.log(pred), dim=1) + return bsl.mean() + + +class BootHardLoss(torch.nn.Module): + def __init__(self, num_classes, beta=0.8): + super(BootSoftLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.beta = beta + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + pred_one_hot = F.one_hot(torch.argmax(pred, dim=1),self.num_classes) + bhl = -torch.sum((self.beta * label_one_hot + (1. - self.beta) * pred_one_hot) * torch.log(pred), dim=1) + return bhl.mean() + + +class ForwardLoss(torch.nn.Module): + def __init__(self, num_classes, noise_rate): + super(ForwardLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.noise_rate = noise_rate + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + assert (self.noise_rate >= 0.) and (self.noise_rate <= 1.) + P = self.noise_rate / (self.num_classes - 1) * torch.ones((self.num_classes, self.num_classes)) + P.diagonal().fill_(1-self.noise_rate) + P = P.to(self.device) + loss = -torch.sum(label_one_hot * torch.log(torch.matmul(pred, P)), dim=-1) + return loss.mean() + +class BackwardLoss(torch.nn.Module): + def __init__(self, num_classes, noise_rate): + super(BackwardLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.noise_rate = noise_rate + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + assert (self.noise_rate >= 0.) and (self.noise_rate <= 1.) 
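        # Uniform (symmetric) noise transition matrix: every diagonal entry is
        # 1 - noise_rate and every off-diagonal entry is noise_rate / (num_classes - 1).
        # Illustrative example with num_classes=3 and noise_rate=0.4:
        #   P = [[0.6, 0.2, 0.2],
        #        [0.2, 0.6, 0.2],
        #        [0.2, 0.2, 0.6]]
        # ForwardLoss above multiplies the predictions by P, whereas the backward
        # correction here multiplies the one-hot labels by P^{-1} before the log-loss.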
+ P = self.noise_rate / (self.num_classes - 1) * torch.ones((self.num_classes, self.num_classes)) + P.diagonal().fill_(1-self.noise_rate) + P = P.to(self.device) + P_inv = torch.inverse(P) + loss=-torch.sum((torch.matmul(label_one_hot, P_inv)) * torch.log(pred), dim=-1) + return loss.mean() + + +class LIDPacedLoss(torch.nn.Module): + def __init__(self, num_classes, alpha, beta1, beta2): + super(LIDPacedLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.alpha = alpha + self.beta1 = beta1 + self.beta2 = beta2 + self.sce = SCELoss(alpha=beta1, beta=beta2, num_classes=num_classes) + + def forward(self, pred, labels): + if self.alpha == 1.0: + return self.sce(pred, labels) + else: + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + pred_labels = F.one_hot(torch.argmax(pred, dim=1), num_classes=label_one_hot.size(1)) + y_new = self.alpha * label_one_hot + (1. - self.alpha) * pred_labels + loss = -torch.sum(y_new * torch.log(pred), dim=-1) + return loss.mean() \ No newline at end of file diff --git a/archive/model.py b/archive/model.py new file mode 100644 index 0000000..d39265e --- /dev/null +++ b/archive/model.py @@ -0,0 +1,197 @@ +import torch.nn as nn +import torch.nn.functional as F + + +class ConvBrunch(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size=3): + super(ConvBrunch, self).__init__() + padding = (kernel_size - 1) // 2 + self.out_conv = nn.Sequential( + nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, padding=padding), + nn.BatchNorm2d(out_planes), + nn.ReLU()) + + def forward(self, x): + return self.out_conv(x) + + +class SCEModel(nn.Module): + def __init__(self, type='cifar10'): + super(SCEModel, self).__init__() + self.type = type + if type == 'cifar10': + self.block1 = nn.Sequential( + ConvBrunch(3, 64, 3), + ConvBrunch(64, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block2 = nn.Sequential( + ConvBrunch(64, 128, 3), + ConvBrunch(128, 128, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block3 = nn.Sequential( + ConvBrunch(128, 196, 3), + ConvBrunch(196, 196, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc1 = nn.Sequential( + nn.Linear(3136, 256), + nn.BatchNorm1d(256), + nn.ReLU()) + self.fc2 = nn.Linear(256, 10) + self.fc_size = 3136 + elif type == 'mnist': + self.block1 = nn.Sequential( + ConvBrunch(1, 32, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block2 = nn.Sequential( + ConvBrunch(32, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc1 = nn.Sequential( + nn.Linear(64*7*7, 128), + nn.BatchNorm1d(128), + nn.ReLU()) + self.fc2 = nn.Linear(128, 10) + self.fc_size = 64*7*7 + self._reset_prams() + + def _reset_prams(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu') + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + return + + def forward(self, x): + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) if self.type == 'cifar10' else x + # x = self.global_avg_pool(x) + x = x.view(-1, self.fc_size) + x = self.fc1(x) + x = self.fc2(x) + return x + + +'''ResNet in PyTorch. +For Pre-activation ResNet, see 'preact_resnet.py'. 
+Reference: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + self._reset_prams() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + def _reset_prams(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu') + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + return + + +def ResNet18(num_classes=10): + return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes) + + +def ResNet34(num_classes=10): + return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes) + + +def 
ResNet50(num_classes=10): + return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes) + + +def ResNet101(num_classes=10): + return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes) + + +def ResNet152(num_classes=10): + return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes) diff --git a/archive/train.py b/archive/train.py new file mode 100644 index 0000000..13bafc7 --- /dev/null +++ b/archive/train.py @@ -0,0 +1,382 @@ +import argparse +import torch +import time +import os +import collections +import pickle +import logging +import torchvision +from tqdm import tqdm +from model import SCEModel, ResNet34 +from dataset import DatasetGenerator, Clothing1MDatasetLoader, ImageNetDatasetLoader +from utils.utils import AverageMeter, accuracy, count_parameters_in_MB +from torch.optim.lr_scheduler import CosineAnnealingLR, MultiStepLR +from train_util import TrainUtil +from loss import * + +# ArgParse +parser = argparse.ArgumentParser(description='RobustLoss') +parser.add_argument('--lr', type=float, default=0.01) +parser.add_argument('--l2_reg', type=float, default=1e-4) +parser.add_argument('--grad_bound', type=float, default=5.0) +parser.add_argument('--train_log_every', type=int, default=100) +parser.add_argument('--resume', action='store_true', default=False) +parser.add_argument('--batch_size', type=int, default=128) +parser.add_argument('--data_path', default='data', type=str) +parser.add_argument('--checkpoint_path', default='checkpoints/cifar10/', type=str) +parser.add_argument('--data_nums_workers', type=int, default=4) +parser.add_argument('--epoch', type=int, default=150) +parser.add_argument('--nr', type=float, default=0.4, help='noise_rate') +parser.add_argument('--loss', type=str, default='SCE', help='SCE, CE, NCE, MAE, RCE') +parser.add_argument('--alpha', type=float, default=1.0, help='alpha scale') +parser.add_argument('--beta', type=float, default=1.0, help='beta scale') +parser.add_argument('--q', type=float, default=0.7, help='q for gce') +parser.add_argument('--gamma', type=float, default=2, help='gamma for FocalLoss') +parser.add_argument('--dataset_type', choices=['mnist', 'cifar10', 'cifar100', 'clothing1m', 'imagenet'], type=str, default='cifar10') +parser.add_argument('--scale_exp', action='store_true', default=False) +parser.add_argument('--alpha_beta_exp', action='store_true', default=False) +parser.add_argument('--version', type=str, default='robust_loss') +parser.add_argument('--run_version', type=str, default='run1') +parser.add_argument('--asym', action='store_true', default=False) +parser.add_argument('--seed', type=int, default=123) +args = parser.parse_args() + +if args.dataset_type == 'cifar100': + args.checkpoint_path = 'checkpoints/cifar100/' + log_dataset_type = 'cifar100' +elif args.dataset_type == 'cifar10': + args.checkpoint_path = 'checkpoints/cifar10/' + log_dataset_type = 'cifar10' +elif args.dataset_type == 'mnist': + args.checkpoint_path = 'checkpoints/mnist/' + log_dataset_type = 'mnist' +elif args.dataset_type == 'clothing1m': + args.checkpoint_path = 'checkpoints/clothing1m/' + log_dataset_type = 'clothing1m' +elif args.dataset_type == 'imagenet': + args.checkpoint_path = 'checkpoints/ILSVR2012/' + log_dataset_type = 'imagenet' +else: + raise('Unknown Dataset') + +log_sym_type = '' +if args.dataset_type == 'clothing1m': + log_dataset_type = 'clothing1m' +elif args.dataset_type == 'imagenet': + log_dataset_type = 'imagenet' +elif not args.dataset_type == 'clothing1m': + args.version = str(args.nr) + 'nr_' + args.loss.lower() 
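    # e.g. running with --nr 0.4 --loss NCEandRCE yields version "0.4nr_nceandrce";
    # the branches below append suffixes for the scale / alpha-beta experiments and for
    # asymmetric noise, and pick the matching checkpoint sub-directory.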
+ if args.scale_exp: + args.version += '_scale_' + str(args.alpha) + elif args.alpha_beta_exp: + args.version += '_ab_' + str(args.alpha) + '_' + str(args.beta) + if args.asym: + log_sym_type = 'asym' + args.version += '_asym' + args.checkpoint_path += 'asym/' + args.run_version + '/' + else: + log_sym_type = 'sym' + args.checkpoint_path += 'sym/' + args.run_version + '/' + + +if not os.path.exists(args.checkpoint_path): + os.makedirs(args.checkpoint_path) +if not os.path.exists(os.path.join('logs', log_dataset_type, log_sym_type, args.run_version)): + os.makedirs(os.path.join('logs', log_dataset_type, log_sym_type, args.run_version)) + + +def setup_logger(name, log_file, level=logging.INFO): + """To setup as many loggers as you want""" + formatter = logging.Formatter('%(asctime)s %(message)s') + handler = logging.FileHandler(log_file) + handler.setFormatter(formatter) + + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(handler) + + return logger + + +log_file_name = os.path.join('logs', log_dataset_type, log_sym_type, args.run_version, args.version) +logger = setup_logger(name=args.version, log_file=log_file_name + ".log") +GLOBAL_STEP, EVAL_STEP, EVAL_BEST_ACC = 0, 0, 0 +TRAIN_HISTORY = collections.defaultdict(list) +torch.manual_seed(args.seed) + +if torch.cuda.is_available(): + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = True + if torch.cuda.device_count() > 1: + device = torch.device('cuda:0') + else: + device = torch.device('cuda') +else: + device = torch.device('cpu') + + +def log_display(epoch, global_step, time_elapse, **kwargs): + display = 'epoch=' + str(epoch) + \ + '\tglobal_step=' + str(global_step) + for key, value in kwargs.items(): + display += '\t' + str(key) + '=%.5f' % value + display += '\ttime=%.2fit/s' % (1. 
/ time_elapse) + return display + + +def model_eval(epoch, fixed_cnn, data_loader, dataset_type='test_dataset'): + global EVAL_STEP + fixed_cnn.eval() + valid_loss_meters = AverageMeter() + valid_acc_meters = AverageMeter() + valid_acc5_meters = AverageMeter() + ce_loss = torch.nn.CrossEntropyLoss() + + for images, labels in tqdm(data_loader[dataset_type]): + start = time.time() + images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True) + with torch.no_grad(): + pred = fixed_cnn(images) + loss = ce_loss(pred, labels) + acc, acc5 = accuracy(pred, labels, topk=(1, 5)) + + valid_loss_meters.update(loss.item(), labels.shape[0]) + valid_acc_meters.update(acc.item(), labels.shape[0]) + valid_acc5_meters.update(acc5.item(), labels.shape[0]) + end = time.time() + + EVAL_STEP += 1 + if EVAL_STEP % args.train_log_every == 0: + display = log_display(epoch=epoch, + global_step=GLOBAL_STEP, + time_elapse=end-start, + loss=loss.item(), + test_loss_avg=valid_loss_meters.avg, + acc=acc.item(), + test_acc_avg=valid_acc_meters.avg, + test_acc_top5_avg=valid_acc5_meters.avg) + logger.info(display) + display = log_display(epoch=epoch, + global_step=GLOBAL_STEP, + time_elapse=end-start, + loss=loss.item(), + test_loss_avg=valid_loss_meters.avg, + acc=acc.item(), + test_acc_avg=valid_acc_meters.avg, + test_acc_top5_avg=valid_acc5_meters.avg) + logger.info(display) + return valid_acc_meters.avg, valid_acc5_meters.avg + + +def train_fixed(starting_epoch, data_loader, fixed_cnn, criterion, fixed_cnn_optmizer, fixed_cnn_scheduler, utilHelper): + global GLOBAL_STEP, reduction_arc, cell_arc, EVAL_BEST_ACC, EVAL_STEP, TRAIN_HISTORY + + for epoch in tqdm(range(starting_epoch, args.epoch)): + logger.info("=" * 20 + "Training" + "=" * 20) + fixed_cnn.train() + train_loss_meters = AverageMeter() + train_acc_meters = AverageMeter() + train_acc5_meters = AverageMeter() + + for images, labels in tqdm(data_loader["train_dataset"]): + images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True) + start = time.time() + fixed_cnn.zero_grad() + fixed_cnn_optmizer.zero_grad() + pred = fixed_cnn(images) + loss = criterion(pred, labels) + loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_(fixed_cnn.parameters(), args.grad_bound) + fixed_cnn_optmizer.step() + acc, acc5 = accuracy(pred, labels, topk=(1, 5)) + + train_loss_meters.update(loss.item(), labels.shape[0]) + train_acc_meters.update(acc.item(), labels.shape[0]) + train_acc5_meters.update(acc5.item(), labels.shape[0]) + + end = time.time() + + GLOBAL_STEP += 1 + if GLOBAL_STEP % args.train_log_every == 0: + lr = fixed_cnn_optmizer.param_groups[0]['lr'] + display = log_display(epoch=epoch, + global_step=GLOBAL_STEP, + time_elapse=end-start, + loss=loss.item(), + loss_avg=train_loss_meters.avg, + acc=acc.item(), + acc_top1_avg=train_acc_meters.avg, + acc_top5_avg=train_acc5_meters.avg, + lr=lr, + gn=grad_norm) + logger.info(display) + if fixed_cnn_scheduler is not None: + fixed_cnn_scheduler.step() + logger.info("="*20 + "Eval" + "="*20) + curr_acc, _ = model_eval(epoch, fixed_cnn, data_loader) + logger.info("curr_acc\t%.4f" % curr_acc) + logger.info("BEST_ACC\t%.4f" % EVAL_BEST_ACC) + payload = '=' * 10 + '\n' + payload = payload + ("curr_acc: %.4f\n best_acc: %.4f\n" % (curr_acc, EVAL_BEST_ACC)) + EVAL_BEST_ACC = max(curr_acc, EVAL_BEST_ACC) + TRAIN_HISTORY["train_loss"].append(train_loss_meters.avg) + TRAIN_HISTORY["train_acc"].append(train_acc_meters.avg) + TRAIN_HISTORY["test_acc"].append(curr_acc) + 
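        # TRAIN_HISTORY grows by one entry per metric per epoch and is re-pickled below
        # after every epoch, so the .pickle file under checkpoint_path always holds the
        # full training curves up to the current epoch.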
TRAIN_HISTORY["test_acc_best"] = [EVAL_BEST_ACC] + with open(args.checkpoint_path + args.version + '.pickle', 'wb') as handle: + pickle.dump(TRAIN_HISTORY, handle, protocol=pickle.HIGHEST_PROTOCOL) + logger.info("Saved!\n") + return + + +def train(): + # Dataset + if args.dataset_type == 'clothing1m': + dataset = Clothing1MDatasetLoader(batchSize=args.batch_size, + dataPath=args.data_path, + numOfWorkers=args.data_nums_workers) + elif args.dataset_type == 'imagenet': + dataset = ImageNetDatasetLoader(batchSize=args.batch_size, + dataPath=args.data_path, + seed=args.seed, + target_class_num=200, + nosiy_rate=0.4, + numOfWorkers=args.data_nums_workers) + else: + dataset = DatasetGenerator(batchSize=args.batch_size, + dataPath=args.data_path, + numOfWorkers=args.data_nums_workers, + noise_rate=args.nr, + asym=args.asym, + seed=args.seed, + dataset_type=args.dataset_type) + + dataLoader = dataset.getDataLoader() + eta_min = 0 + ln_neg = 1 + + if args.dataset_type == 'clothing1m': + # Train Clothing1M + args.epoch = 20 + args.l2_reg = 1e-3 + num_classes = 14 + fixed_cnn = torchvision.models.resnet50(num_classes=14) + # fixed_cnn.fc = torch.nn.Linear(2048, 14) + + elif args.dataset_type == 'cifar100': + # Train CIFAR100 + args.lr = 0.1 + args.epoch = 200 + num_classes = 100 + fixed_cnn = ResNet34(num_classes=num_classes) + + # NLNL + if args.loss == 'NLNL': + args.epoch = 2000 + ln_neg = 110 + + elif args.dataset_type == 'cifar10': + # Train CIFAR10 + args.epoch = 120 + num_classes = 10 + fixed_cnn = SCEModel(type='cifar10') + + # NLNL + if args.loss == 'NLNL': + args.epoch = 1000 + + elif args.dataset_type == 'mnist': + # Train mnist + args.epoch = 50 + num_classes = 10 + fixed_cnn = SCEModel(type='mnist') + eta_min = 0.001 + args.l2_reg = 1e-3 + # NLNL + if args.loss == 'NLNL': + args.epoch = 720 + + elif args.dataset_type == 'imagenet': + args.epoch = 100 + args.l2_reg = 3e-5 + num_classes = 200 + fixed_cnn = torchvision.models.resnet50(num_classes=num_classes) + + logger.info("num_classes: %s" % (num_classes)) + + loss_options = { + 'SCE': SCELoss(alpha=args.alpha, beta=args.beta, num_classes=num_classes), + 'CE': torch.nn.CrossEntropyLoss(), + 'NCE': NormalizedCrossEntropy(scale=args.alpha, num_classes=num_classes), + 'MAE': MeanAbsoluteError(scale=args.alpha, num_classes=num_classes), + 'NMAE': NormalizedMeanAbsoluteError(scale=args.alpha, num_classes=num_classes), + 'GCE': GeneralizedCrossEntropy(num_classes=num_classes, q=args.q), + 'RCE': ReverseCrossEntropy(scale=args.alpha, num_classes=num_classes), + 'NRCE': NormalizedReverseCrossEntropy(scale=args.alpha, num_classes=num_classes), + 'NGCE': NormalizedGeneralizedCrossEntropy(scale=args.alpha, num_classes=num_classes, q=args.q), + 'NCEandRCE': NCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes), + 'NCEandMAE': NCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes), + 'GCEandMAE': GCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q), + 'GCEandRCE': GCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q), + 'GCEandNCE': GCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q), + 'MAEandRCE': MAEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes), + 'NGCEandNCE': NGCEandNCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q), + 'NGCEandMAE': NGCEandMAE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, q=args.q), + 'NGCEandRCE': NGCEandRCE(alpha=args.alpha, beta=args.beta, num_classes=num_classes, 
q=args.q), + 'FocalLoss': FocalLoss(gamma=args.gamma), + 'NFL': NormalizedFocalLoss(scale=args.alpha, gamma=args.gamma, num_classes=num_classes), + 'NLNL': NLNL(num_classes=num_classes, train_loader=dataLoader['train_dataset'], ln_neg=ln_neg), + 'NFLandNCE': NFLandNCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes), + 'NFLandMAE': NFLandMAE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes), + 'NFLandRCE': NFLandRCE(alpha=args.alpha, beta=args.beta, gamma=args.gamma, num_classes=num_classes), + 'DMI': DMILoss(num_classes=num_classes) + } + + if args.loss in loss_options: + criterion = loss_options[args.loss] + else: + raise ValueError("Unknown loss") + + logger.info(criterion.__class__.__name__) + logger.info("Number of Trainable Parameters %.4f" % count_parameters_in_MB(fixed_cnn)) + + fixed_cnn.to(device) + + if args.loss == 'DMI': + criterion = loss_options['CE'] + + fixed_cnn_optmizer = torch.optim.SGD(params=fixed_cnn.parameters(), + lr=args.lr, + momentum=0.9, + weight_decay=args.l2_reg) + + fixed_cnn_scheduler = CosineAnnealingLR(fixed_cnn_optmizer, + float(args.epoch), + eta_min=eta_min) + if args.dataset_type == 'clothing1m': + fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optmizer, milestones=[5, 10], gamma=0.1) + elif args.dataset_type == 'imagenet': + fixed_cnn_scheduler = MultiStepLR(fixed_cnn_optmizer, milestones=[30, 60, 80], gamma=0.1) + + utilHelper = TrainUtil(checkpoint_path=args.checkpoint_path, version=args.version) + starting_epoch = 0 + + for arg in vars(args): + logger.info("%s: %s" % (arg, getattr(args, arg))) + + train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion, fixed_cnn_optmizer, fixed_cnn_scheduler, utilHelper) + + if args.loss == 'DMI': + criterion = loss_options['DMI'] + fixed_cnn_optmizer = torch.optim.SGD(params=fixed_cnn.parameters(), + lr=1e-6, + momentum=0.9, + weight_decay=args.l2_reg) + starting_epoch = 0 + fixed_cnn_scheduler = None + train_fixed(starting_epoch, dataLoader, fixed_cnn, criterion, fixed_cnn_optmizer, fixed_cnn_scheduler, utilHelper) + + +if __name__ == '__main__': + train() diff --git a/archive/train_util.py b/archive/train_util.py new file mode 100644 index 0000000..9484945 --- /dev/null +++ b/archive/train_util.py @@ -0,0 +1,106 @@ +import torch +import os +import pickle + +if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + if torch.cuda.device_count() > 1: + device = torch.device('cuda:0') + else: + device = torch.device('cuda') +else: + device = torch.device('cpu') + + +class TrainUtil(): + def __init__(self, checkpoint_path='checkpoints', version='mcts_nas_net_v1'): + self.checkpoint_path = checkpoint_path + self.version = version + return + + def save_model_fixed(self, epoch, fixed_cnn, fixed_cnn_optmizer, save_best=False, **kwargs): + filename = os.path.join(self.checkpoint_path, self.version) + '.pth' + # Torch Save State Dict + state = { + 'epoch': epoch+1, + 'shared_cnn': fixed_cnn.state_dict(), + 'shared_cnn_optmizer': fixed_cnn_optmizer.state_dict(), + } + for key, value in kwargs.items(): + state[key] = value + torch.save(state, filename) + filename = os.path.join(self.checkpoint_path, self.version) + '_best.pth' + if save_best: + torch.save(state, filename) + return + + def load_model_fixed(self, fixed_cnn, fixed_cnn_optmizer, **kwargs): + filename = os.path.join(self.checkpoint_path, self.version) + '.pth' + # Load Torch State Dict + checkpoints = torch.load(filename) + fixed_cnn.load_state_dict(checkpoints['fixed_cnn']) + 
fixed_cnn_optmizer.load_state_dict(checkpoints['fixed_cnn_optmizer']) + print(filename + " Loaded!") + return checkpoints + + def save_model(self, + mcts, + shared_cnn, + shared_cnn_optmizer, + shared_cnn_schduler, + estimator, + estimator_optmizer, + epoch, + **kwargs): + mcts_filename = os.path.join(self.checkpoint_path, self.version) + '_mcts' + '.pkl' + filename = os.path.join(self.checkpoint_path, self.version) + '.pth' + + # Torch Save State Dict + state = { + 'epoch': epoch+1, + 'shared_cnn': shared_cnn.state_dict(), + 'shared_cnn_optmizer': shared_cnn_optmizer.state_dict(), + 'shared_cnn_schduler': shared_cnn_schduler.state_dict(), + 'estimator': estimator.state_dict(), + 'estimator_optmizer': estimator_optmizer.state_dict() + } + for key, value in kwargs.items(): + state[key] = value + torch.save(state, filename) + print(filename + " saved!") + + # Save MCTS to pickle + rolloutPolicy, searchPolicy = mcts.rollout, mcts.searchPolicy + mcts.rollout, mcts.searchPolicy = None, None + with open(mcts_filename, 'wb') as handle: + pickle.dump(mcts, handle, protocol=pickle.HIGHEST_PROTOCOL) + print(mcts_filename + " Saved!") + mcts.rollout, mcts.searchPolicy = rolloutPolicy, searchPolicy + return + + def load_model(self, + shared_cnn, + shared_cnn_optmizer, + shared_cnn_schduler, + estimator, + estimator_optmizer, + **kwargs): + + filename = os.path.join(self.checkpoint_path, self.version) + '.pth' + mcts_filename = os.path.join(self.checkpoint_path, self.version) + '_mcts' + '.pkl' + + # Load Torch State Dict + checkpoints = torch.load(filename) + shared_cnn.load_state_dict(checkpoints['shared_cnn']) + shared_cnn_optmizer.load_state_dict(checkpoints['shared_cnn_optmizer']) + shared_cnn_schduler.load_state_dict(checkpoints['shared_cnn_schduler']) + shared_cnn_schduler.optimizer = shared_cnn_optmizer + estimator.load_state_dict(checkpoints['estimator']) + estimator_optmizer.load_state_dict(checkpoints['estimator_optmizer']) + print(filename + " Loaded!") + + # Load MCTS + with open(mcts_filename, 'rb') as handle: + mcts = pickle.load(handle) + print(mcts_filename + " Loaded!") + return checkpoints, mcts diff --git a/archive/utils/__init__.py b/archive/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/archive/utils/utils.py b/archive/utils/utils.py new file mode 100644 index 0000000..b72833e --- /dev/null +++ b/archive/utils/utils.py @@ -0,0 +1,85 @@ +import csv +import sys +import numpy as np + + +class CSVLogger(): + def __init__(self, args, fieldnames, filename='log.csv'): + + self.filename = filename + self.csv_file = open(filename, 'w') + + # Write model configuration at top of csv + writer = csv.writer(self.csv_file) + for arg in vars(args): + writer.writerow([arg, getattr(args, arg)]) + writer.writerow(['']) + + self.writer = csv.DictWriter(self.csv_file, fieldnames=fieldnames) + self.writer.writeheader() + + self.csv_file.flush() + + def writerow(self, row): + self.writer.writerow(row) + self.csv_file.flush() + + def close(self): + self.csv_file.close() + + +class Logger(object): + def __init__(self, filename): + self.terminal = sys.stdout + self.log = open(filename, 'w') + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + self.log.flush() + + def flush(self): + # this flush method is needed for python 3 compatibility. + # this handles the flush command by doing nothing. + # you might want to specify some extra behavior here. 
+ pass + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + self.max = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + self.max = max(self.max, val) + + +def accuracy(output, target, topk=(1,)): + maxk = max(topk) + + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(1/batch_size)) + return res + + +def count_parameters_in_MB(model): + return sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary_head" not in name)/1e6 diff --git a/callback_util.py b/callback_util.py index 8bd8647..2372c85 100644 --- a/callback_util.py +++ b/callback_util.py @@ -1,26 +1,19 @@ import numpy as np -import keras.backend as K -from keras.utils import np_utils -from keras.callbacks import Callback, LearningRateScheduler -from keras.optimizers import SGD +import torch from util import get_lids_random_batch -from loss import cross_entropy, lid_paced_loss -from lass_tf import lass -import tensorflow as tf +import os -class D2LCallback(Callback): - def __init__(self, model, X_train, y_train, dataset, noise_ratio, epochs=150, +class D2LCallback: + def __init__(self, model, data_loader, device, epochs=120, pace_type='d2l', init_epoch=5, epoch_win=5, lid_subset_size=1280, - lid_k=20, verbose=1): + lid_k=20, verbose=1, is_found_turning_point=False): super(D2LCallback, self).__init__() self.validation_data = None self.model = model self.turning_epoch = -1 - self.X_train = X_train - self.y_train = y_train - self.dataset = dataset - self.noise_ratio = noise_ratio + self.data_loader = data_loader + self.device = device self.epochs = epochs self.pace_type = pace_type self.mean_lid = -1. 
@@ -32,11 +25,11 @@ def __init__(self, model, X_train, y_train, dataset, noise_ratio, epochs=150, self.lid_k = lid_k self.verbose = verbose self.alpha = 1.0 + self.is_found_turning_point = is_found_turning_point def on_epoch_begin(self, epoch, logs={}): - rand_idxes = np.random.choice(self.X_train.shape[0], self.lid_subset_size, replace=False) - lid = np.mean(get_lids_random_batch(self.model, self.X_train[rand_idxes], k=self.lid_k, batch_size=128)) - + lids_tem = get_lids_random_batch(self.model, self.data_loader, self.device, k=20, batch_size=128) + lid = lids_tem.mean().item() self.p_lambda = epoch*1./self.epochs # deal with possible illegal lid value @@ -48,6 +41,7 @@ def on_epoch_begin(self, epoch, logs={}): # find the turning point where to apply lid-paced learning strategy if self.found_turning_point(self.lids): self.update_learning_pace() + self.is_found_turning_point = True if len(self.lids) > 5: print('lid = ..., ', self.lids[-5:]) @@ -56,172 +50,30 @@ def on_epoch_begin(self, epoch, logs={}): if self.verbose > 0: print('--Epoch: %s, LID: %.2f, min LID: %.2f, lid window: %s, turning epoch: %s, lambda: %.2f' % - (epoch, lid, np.min(self.lids), self.epoch_win, self.turning_epoch, self.p_lambda)) + (epoch, lid, min(self.lids), self.epoch_win, self.turning_epoch, self.p_lambda)) return def found_turning_point(self, lids): - if len(lids) > self.init_epoch + self.epoch_win: # - if self.turning_epoch > -1: # if turning point is already found, stop checking + if len(lids) > self.init_epoch + self.epoch_win: + if self.turning_epoch > -1: return True else: smooth_lids = lids[-self.epoch_win-1:-1] - # self.mean_lid = np.mean(smooth_lids) - if lids[-1] - np.mean(smooth_lids) > 2*np.std(smooth_lids): + if lids[-1] - torch.mean(torch.tensor(smooth_lids)) > 2 * torch.std(torch.tensor(smooth_lids)): self.turning_epoch = len(lids) - 2 # rollback model if you want, should be called before checkpoint callback # otherwise need to save two models - min_model_path = 'model/%s_%s_%s.hdf5' % (self.pace_type, - self.dataset, - self.noise_ratio) - self.model.load_weights(min_model_path) return True - else: - return False + return False def update_learning_pace(self): - # # this loss is not working for d2l learning, somehow, why??? - expansion = self.lids[-1] / np.min(self.lids) - self.alpha = np.exp(-self.p_lambda * expansion) + expansion = self.lids[-1] / min(self.lids) + self.alpha = torch.exp(torch.tensor(-self.p_lambda * expansion)).item() # self.alpha = np.exp(-0.1*expansion) print('## Turning epoch: %s, lambda: %.2f, expansion: %.2f, alpha: %.2f' % (self.turning_epoch, self.p_lambda, expansion, self.alpha)) # self.alpha = np.exp(-expansion) - self.model.compile(loss=lid_paced_loss(self.alpha), - optimizer=self.model.optimizer, metrics=['accuracy']) - - -class LoggerCallback(Callback): - """ - Log train/val loss and acc into file for later plots. 
- """ - def __init__(self, model, X_train, y_train, X_test, y_test, dataset, - model_name, noise_ratio, epochs): - super(LoggerCallback, self).__init__() - self.model = model - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - self.dataset = dataset - self.model_name = model_name - self.noise_ratio = noise_ratio - self.epochs = epochs - - self.train_loss = [] - self.test_loss = [] - self.train_acc = [] - self.test_acc = [] - # the followings are used to estimate LID - self.lid_k = 20 - self.lid_subset = 128 - self.lids = [] - - # complexity - Critical Sample Ratio (csr) - self.csr_subset = 500 - self.csr_batchsize = 100 - self.csrs = [] - - def on_epoch_end(self, epoch, logs={}): - tr_acc = logs.get('acc') - tr_loss = logs.get('loss') - val_loss = logs.get('val_loss') - val_acc = logs.get('val_acc') - # te_loss, te_acc = self.model.evaluate(self.X_test, self.y_test, batch_size=128, verbose=0) - self.train_loss.append(tr_loss) - self.test_loss.append(val_loss) - self.train_acc.append(tr_acc) - self.test_acc.append(val_acc) - - file_name = 'log/loss_%s_%s_%s.npy' % \ - (self.model_name, self.dataset, self.noise_ratio) - np.save(file_name, np.stack((np.array(self.train_loss), np.array(self.test_loss)))) - file_name = 'log/acc_%s_%s_%s.npy' % \ - (self.model_name, self.dataset, self.noise_ratio) - np.save(file_name, np.stack((np.array(self.train_acc), np.array(self.test_acc)))) - - # print('\n--Epoch %02d, train_loss: %.2f, train_acc: %.2f, val_loss: %.2f, val_acc: %.2f' % - # (epoch, tr_loss, tr_acc, val_loss, val_acc)) - - # calculate LID/CSR and save every 10 epochs - if epoch % 1 == 0: - # compute lid scores - rand_idxes = np.random.choice(self.X_train.shape[0], self.lid_subset * 10, replace=False) - lid = np.mean(get_lids_random_batch(self.model, self.X_train[rand_idxes], - k=self.lid_k, batch_size=self.lid_subset)) - self.lids.append(lid) - - file_name = 'log/lid_%s_%s_%s.npy' % \ - (self.model_name, self.dataset, self.noise_ratio) - np.save(file_name, np.array(self.lids)) - - if len(np.array(self.lids).flatten()) > 20: - print('lid = ...', self.lids[-20:]) - else: - print('lid = ', self.lids) - - # compute csr scores - # LASS to estimate the critical sample ratio - scale_factor = 255. / (np.max(self.X_test) - np.min(self.X_test)) - y = tf.placeholder(tf.float32, shape=(None,) + self.y_test.shape[1:]) - csr_model = lass(self.model.layers[0].input, self.model.layers[-1].output, y, - a=0.25 / scale_factor, - b=0.2 / scale_factor, - r=0.3 / scale_factor, - iter_max=100) - rand_idxes = np.random.choice(self.X_test.shape[0], self.csr_subset, replace=False) - X_adv, adv_ind = csr_model.find(self.X_test[rand_idxes], bs=self.csr_batchsize) - csr = np.sum(adv_ind) * 1. / self.csr_subset - self.csrs.append(csr) - - file_name = 'log/csr_%s_%s_%s.npy' % \ - (self.model_name, self.dataset, self.noise_ratio) - np.save(file_name, np.array(self.csrs)) - - if len(self.csrs) > 20: - print('csr = ...', self.csrs[-20:]) - else: - print('csr = ', self.csrs) - - return - -def get_lr_scheduler(dataset): - """ - customerized learning rate decay for training with clean labels. - For efficientcy purpose we use large lr for noisy data. 
- :param dataset: - :param noise_ratio: - :return: - """ - if dataset in ['mnist', 'svhn']: - def scheduler(epoch): - if epoch > 40: - return 0.001 - elif epoch > 20: - return 0.01 - else: - return 0.1 - return LearningRateScheduler(scheduler) - elif dataset in ['cifar-10']: - def scheduler(epoch): - if epoch > 80: - return 0.001 - elif epoch > 40: - return 0.01 - else: - return 0.1 - return LearningRateScheduler(scheduler) - elif dataset in ['cifar-100']: - def scheduler(epoch): - if epoch > 160: - return 0.0001 - elif epoch > 120: - return 0.001 - elif epoch > 80: - return 0.01 - else: - return 0.1 - return LearningRateScheduler(scheduler) diff --git a/complexity_plot.py b/complexity_plot.py deleted file mode 100644 index e3cd6c5..0000000 --- a/complexity_plot.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -Train test error/accuracy/loss plot. - -Author: Xingjun Ma -""" -import os -import numpy as np -import tensorflow as tf -import keras.backend as K -from keras.datasets import mnist, cifar10 -from keras.optimizers import SGD -from keras.utils import to_categorical -import matplotlib.pyplot as plt -from datasets import get_data -from models import get_model -from loss import cross_entropy -from lass_tf import lass - -np.random.seed(1024) - -MODELS = ['ce', 'forward', 'backward', 'boot_soft', 'boot_hard', 'd2l'] -MODEL_LABELS = ['cross-entropy', 'forward', 'backward', 'boot-soft', 'boot-hard', 'D2L'] -COLORS = ['r', 'y', 'c', 'm', 'g', 'b'] -MARKERS = ['x', 'D', '<', '>', '^', 'o'] - -def complexity_plot(model_list, dataset='mnist', num_classes=10, noise_ratio=10, epochs=50, n_samples=500): - """ - The complexity (Critical Sample Ratio) of the hypothesis learned throughout training. - """ - print('Dataset: %s, epochs: %s, noise ratio: %s%%' % (dataset, epochs, noise_ratio)) - - # plot initialization - fig = plt.figure() # figsize=(7, 6) - ax = fig.add_subplot(111) - bins = np.arange(epochs) - xnew = np.arange(0, epochs, 5) - - # load data - _, _, X_test, Y_test = get_data(dataset) - # convert class vectors to binary class matrices - Y_test = to_categorical(Y_test, num_classes) - - shuffle = np.random.permutation(X_test.shape[0]) - X_test = X_test[shuffle] - Y_test = Y_test[shuffle] - X_test = X_test[:n_samples] - Y_test = Y_test[:n_samples] - - # load model - image_shape = X_test.shape[1:] - model = get_model(dataset, input_tensor=None, input_shape=image_shape) - sgd = SGD(lr=0.01, momentum=0.9) - y = tf.placeholder(tf.float32, shape=(None,) + Y_test.shape[1:]) - - for model_name in model_list: - file_name = "log/crs_%s_%s_%s.npy" % (model_name, dataset, noise_ratio) - if os.path.isfile(file_name): - crs = np.load(file_name) - # plot line - idx = MODELS.index(model_name) - - # z = np.polyfit(bins, crs, deg=5) - # f = np.poly1d(z) - # crs = f(xnew) - - for i in xnew: - crs[i] = np.mean(crs[i:i+5]) - - crs = crs[xnew] - - ax.plot(xnew, crs, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) - continue - - crs = np.zeros(epochs) - for i in range(epochs): - # the critical sample ratio of the representations learned at every epoch - # need to save those epochs first, in this case, use separate folders for each model - model_path = 'model/%s/%s_%s.%02d.hdf5' % (model_name, dataset, noise_ratio, i) - model.load_weights(model_path) - model.compile( - loss=cross_entropy, - optimizer=sgd, - metrics=['accuracy'] - ) - - # LASS to estimate the critical sample ratio - scale_factor = 255. 
/ (np.max(X_test) - np.min(X_test)) - csr_model = lass(model.layers[0].input, model.layers[-1].output, y, - a=0.25 / scale_factor, - b=0.2 / scale_factor, - r=0.3 / scale_factor, - iter_max=100) - X_adv, adv_ind = csr_model.find(X_test, bs=500) - crs[i] = np.sum(adv_ind) * 1. / n_samples - - print('model: %s, epoch: %s, CRS: %s' % (model_name, i, crs[i])) - - # save result to avoid recomputing - np.save(file_name, crs) - print(crs) - - # plot line - idx = MODELS.index(model_name) - - z = np.polyfit(bins, crs, deg=5) - f = np.poly1d(z) - crs = f(xnew) - - ax.plot(xnew, crs, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) - - # ax.set_xticks([]) - # ax.set_yticks([]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Hypothesis complexity (CSR score)", fontsize=15) - # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) - legend = plt.legend(loc='upper left') - plt.setp(legend.get_texts(), fontsize=15) - fig.savefig("plots/complexity_trend_all_models_%s_%s.png" % (dataset, noise_ratio), dpi=300) - plt.show() - -if __name__ == "__main__": - # mnist: epoch=50, cifar-10: epoch=120 - complexity_plot(model_list=['ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'd2l'], - dataset='cifar-10', num_classes=10, noise_ratio=60, epochs=120, n_samples=500) diff --git a/configs/cifar10/asym/bhl.yaml b/configs/cifar10/asym/bhl.yaml new file mode 100644 index 0000000..e459990 --- /dev/null +++ b/configs/cifar10/asym/bhl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.8 \ No newline at end of file diff --git a/configs/cifar10/asym/bl.yaml b/configs/cifar10/asym/bl.yaml new file mode 100644 index 0000000..959320c --- /dev/null +++ b/configs/cifar10/asym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BackwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar10/asym/bsl.yaml b/configs/cifar10/asym/bsl.yaml new file mode 100644 index 0000000..81d30bb --- /dev/null +++ b/configs/cifar10/asym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.95 \ No newline at end of file diff --git a/configs/cifar10/asym/ce.yaml b/configs/cifar10/asym/ce.yaml new file mode 100644 index 
0000000..a665689 --- /dev/null +++ b/configs/cifar10/asym/ce.yaml @@ -0,0 +1,30 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: CrossEntropyLoss diff --git a/configs/cifar10/asym/d2l.yaml b/configs/cifar10/asym/d2l.yaml new file mode 100644 index 0000000..875d393 --- /dev/null +++ b/configs/cifar10/asym/d2l.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: LIDPacedLoss + num_classes: 10 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/cifar10/asym/fl.yaml b/configs/cifar10/asym/fl.yaml new file mode 100644 index 0000000..a97025d --- /dev/null +++ b/configs/cifar10/asym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ForwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar10/asym/focal.yaml b/configs/cifar10/asym/focal.yaml new file mode 100644 index 0000000..c00eb66 --- /dev/null +++ b/configs/cifar10/asym/focal.yaml @@ -0,0 +1,31 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/cifar10/asym/gce.yaml b/configs/cifar10/asym/gce.yaml new file mode 100644 index 0000000..23338cf --- /dev/null +++ b/configs/cifar10/asym/gce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 10 + q: 0.7 diff --git a/configs/cifar10/asym/mae.yaml b/configs/cifar10/asym/mae.yaml new file mode 100644 index 0000000..69457cc --- /dev/null +++ b/configs/cifar10/asym/mae.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 
+ +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: MeanAbsoluteError + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/asym/nce+mae.yaml b/configs/cifar10/asym/nce+mae.yaml new file mode 100644 index 0000000..3d077e5 --- /dev/null +++ b/configs/cifar10/asym/nce+mae.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandMAE + num_classes: 10 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/nce+rce.yaml b/configs/cifar10/asym/nce+rce.yaml new file mode 100644 index 0000000..01a46df --- /dev/null +++ b/configs/cifar10/asym/nce+rce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandRCE + num_classes: 10 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/nce.yaml b/configs/cifar10/asym/nce.yaml new file mode 100644 index 0000000..6734a62 --- /dev/null +++ b/configs/cifar10/asym/nce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/asym/nfl+mae.yaml b/configs/cifar10/asym/nfl+mae.yaml new file mode 100644 index 0000000..d5f723c --- /dev/null +++ b/configs/cifar10/asym/nfl+mae.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandMAE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/nfl+rce.yaml b/configs/cifar10/asym/nfl+rce.yaml new file mode 100644 index 0000000..f2ee4f5 --- /dev/null +++ b/configs/cifar10/asym/nfl+rce.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + 
eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandRCE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/nfl.yaml b/configs/cifar10/asym/nfl.yaml new file mode 100644 index 0000000..5dfad2e --- /dev/null +++ b/configs/cifar10/asym/nfl.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedFocalLoss + num_classes: 10 + scale: 1.0 + gamma: 0.5 diff --git a/configs/cifar10/asym/ngce+mae.yaml b/configs/cifar10/asym/ngce+mae.yaml new file mode 100644 index 0000000..06cf547 --- /dev/null +++ b/configs/cifar10/asym/ngce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/ngce+rce.yaml b/configs/cifar10/asym/ngce+rce.yaml new file mode 100644 index 0000000..06cf547 --- /dev/null +++ b/configs/cifar10/asym/ngce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/asym/ngce.yaml b/configs/cifar10/asym/ngce.yaml new file mode 100644 index 0000000..818891e --- /dev/null +++ b/configs/cifar10/asym/ngce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedGeneralizedCrossEntropy + num_classes: 10 + q: 0.1 + scale: 1.0 diff --git a/configs/cifar10/asym/nlnl.yaml b/configs/cifar10/asym/nlnl.yaml new file mode 100644 index 0000000..9f2d4f6 --- /dev/null +++ b/configs/cifar10/asym/nlnl.yaml @@ -0,0 +1,32 @@ +epochs: 1000 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + 
data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NLNL + num_classes: 10 + ln_neg: 1 diff --git a/configs/cifar10/asym/rce.yaml b/configs/cifar10/asym/rce.yaml new file mode 100644 index 0000000..0adfafc --- /dev/null +++ b/configs/cifar10/asym/rce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ReverseCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/asym/sce.yaml b/configs/cifar10/asym/sce.yaml new file mode 100644 index 0000000..e1d97f7 --- /dev/null +++ b/configs/cifar10/asym/sce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: SCELoss + num_classes: 10 + alpha: 0.1 + beta: 1.0 diff --git a/configs/cifar10/sym/bhl.yaml b/configs/cifar10/sym/bhl.yaml new file mode 100644 index 0000000..a68e993 --- /dev/null +++ b/configs/cifar10/sym/bhl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.8 \ No newline at end of file diff --git a/configs/cifar10/sym/bl.yaml b/configs/cifar10/sym/bl.yaml new file mode 100644 index 0000000..6a42c82 --- /dev/null +++ b/configs/cifar10/sym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BackwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar10/sym/bsl.yaml b/configs/cifar10/sym/bsl.yaml new file mode 100644 index 0000000..3402c3b --- /dev/null +++ b/configs/cifar10/sym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type 
+ +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.95 \ No newline at end of file diff --git a/configs/cifar10/sym/ce.yaml b/configs/cifar10/sym/ce.yaml new file mode 100644 index 0000000..6f45e1d --- /dev/null +++ b/configs/cifar10/sym/ce.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +#scheduler: + #name: CosineAnnealingLR + #T_max: $epochs + #eta_min: 0.0 +scheduler: + name: StepLR + step_size: 40 + gamma: 0.1 + +criterion: + name: CrossEntropyLoss diff --git a/configs/cifar10/sym/d2l.yaml b/configs/cifar10/sym/d2l.yaml new file mode 100644 index 0000000..4906968 --- /dev/null +++ b/configs/cifar10/sym/d2l.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: LIDPacedLoss + num_classes: 10 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/cifar10/sym/fl.yaml b/configs/cifar10/sym/fl.yaml new file mode 100644 index 0000000..e51103f --- /dev/null +++ b/configs/cifar10/sym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ForwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar10/sym/focal.yaml b/configs/cifar10/sym/focal.yaml new file mode 100644 index 0000000..62425fd --- /dev/null +++ b/configs/cifar10/sym/focal.yaml @@ -0,0 +1,31 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/cifar10/sym/gce.yaml b/configs/cifar10/sym/gce.yaml new file mode 100644 index 0000000..62420e3 --- /dev/null +++ b/configs/cifar10/sym/gce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + 
momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 10 + q: 0.7 diff --git a/configs/cifar10/sym/mae.yaml b/configs/cifar10/sym/mae.yaml new file mode 100644 index 0000000..bb7e561 --- /dev/null +++ b/configs/cifar10/sym/mae.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: MeanAbsoluteError + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/sym/nce+mae.yaml b/configs/cifar10/sym/nce+mae.yaml new file mode 100644 index 0000000..0ccc85f --- /dev/null +++ b/configs/cifar10/sym/nce+mae.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandMAE + num_classes: 10 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/nce+rce.yaml b/configs/cifar10/sym/nce+rce.yaml new file mode 100644 index 0000000..4c58521 --- /dev/null +++ b/configs/cifar10/sym/nce+rce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandRCE + num_classes: 10 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/nce.yaml b/configs/cifar10/sym/nce.yaml new file mode 100644 index 0000000..d115054 --- /dev/null +++ b/configs/cifar10/sym/nce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/sym/nfl+mae.yaml b/configs/cifar10/sym/nfl+mae.yaml new file mode 100644 index 0000000..59b6ff8 --- /dev/null +++ b/configs/cifar10/sym/nfl+mae.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: 
NFLandMAE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/nfl+rce.yaml b/configs/cifar10/sym/nfl+rce.yaml new file mode 100644 index 0000000..8b5c97b --- /dev/null +++ b/configs/cifar10/sym/nfl+rce.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandRCE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/nfl.yaml b/configs/cifar10/sym/nfl.yaml new file mode 100644 index 0000000..3d5a934 --- /dev/null +++ b/configs/cifar10/sym/nfl.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedFocalLoss + num_classes: 10 + scale: 10.0 + gamma: 0.5 diff --git a/configs/cifar10/sym/ngce+mae.yaml b/configs/cifar10/sym/ngce+mae.yaml new file mode 100644 index 0000000..eaf3bfd --- /dev/null +++ b/configs/cifar10/sym/ngce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/ngce+rce.yaml b/configs/cifar10/sym/ngce+rce.yaml new file mode 100644 index 0000000..8f021d8 --- /dev/null +++ b/configs/cifar10/sym/ngce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandRCE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 1.0 diff --git a/configs/cifar10/sym/ngce.yaml b/configs/cifar10/sym/ngce.yaml new file mode 100644 index 0000000..ac42d1c --- /dev/null +++ b/configs/cifar10/sym/ngce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedGeneralizedCrossEntropy + 
num_classes: 10 + scale: 1.0 + q: 0.1 diff --git a/configs/cifar10/sym/nlnl.yaml b/configs/cifar10/sym/nlnl.yaml new file mode 100644 index 0000000..bb0d8b2 --- /dev/null +++ b/configs/cifar10/sym/nlnl.yaml @@ -0,0 +1,32 @@ +epochs: 1000 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NLNL + num_classes: 10 + ln_neg: 1 diff --git a/configs/cifar10/sym/rce.yaml b/configs/cifar10/sym/rce.yaml new file mode 100644 index 0000000..41dd3f3 --- /dev/null +++ b/configs/cifar10/sym/rce.yaml @@ -0,0 +1,32 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ReverseCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/cifar10/sym/sce.yaml b/configs/cifar10/sym/sce.yaml new file mode 100644 index 0000000..87a29a4 --- /dev/null +++ b/configs/cifar10/sym/sce.yaml @@ -0,0 +1,33 @@ +epochs: 120 +grad_bound: 5.0 +log_frequency: 100 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 128 + data_path: ../datasets/ + dataset_type: 'CIFAR10' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-4 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: SCELoss + num_classes: 10 + alpha: 0.1 + beta: 1.0 diff --git a/configs/cifar100/asym/bhl.yaml b/configs/cifar100/asym/bhl.yaml new file mode 100644 index 0000000..dbbdd89 --- /dev/null +++ b/configs/cifar100/asym/bhl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 100 + beta: 0.8 \ No newline at end of file diff --git a/configs/cifar100/asym/bl.yaml b/configs/cifar100/asym/bl.yaml new file mode 100644 index 0000000..0c78bbe --- /dev/null +++ b/configs/cifar100/asym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BackwardLoss + num_classes: 100 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar100/asym/bsl.yaml b/configs/cifar100/asym/bsl.yaml new file mode 100644 
index 0000000..faad8ce --- /dev/null +++ b/configs/cifar100/asym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 100 + beta: 0.95 \ No newline at end of file diff --git a/configs/cifar100/asym/ce.yaml b/configs/cifar100/asym/ce.yaml new file mode 100644 index 0000000..e99f3b4 --- /dev/null +++ b/configs/cifar100/asym/ce.yaml @@ -0,0 +1,30 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: CrossEntropyLoss diff --git a/configs/cifar100/asym/d2l.yaml b/configs/cifar100/asym/d2l.yaml new file mode 100644 index 0000000..7cb0e40 --- /dev/null +++ b/configs/cifar100/asym/d2l.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: LIDPacedLoss + num_classes: 100 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/cifar100/asym/fl.yaml b/configs/cifar100/asym/fl.yaml new file mode 100644 index 0000000..f827587 --- /dev/null +++ b/configs/cifar100/asym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ForwardLoss + num_classes: 100 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar100/asym/focal.yaml b/configs/cifar100/asym/focal.yaml new file mode 100644 index 0000000..a836fa7 --- /dev/null +++ b/configs/cifar100/asym/focal.yaml @@ -0,0 +1,31 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/cifar100/asym/gce.yaml b/configs/cifar100/asym/gce.yaml new file mode 100644 index 0000000..f6cffc5 --- /dev/null +++ b/configs/cifar100/asym/gce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + 
name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 100 + q: 0.7 diff --git a/configs/cifar100/asym/mae.yaml b/configs/cifar100/asym/mae.yaml new file mode 100644 index 0000000..4a46a93 --- /dev/null +++ b/configs/cifar100/asym/mae.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: MeanAbsoluteError + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/asym/nce+mae.yaml b/configs/cifar100/asym/nce+mae.yaml new file mode 100644 index 0000000..8c28957 --- /dev/null +++ b/configs/cifar100/asym/nce+mae.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandMAE + num_classes: 100 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/asym/nce+rce.yaml b/configs/cifar100/asym/nce+rce.yaml new file mode 100644 index 0000000..004a76b --- /dev/null +++ b/configs/cifar100/asym/nce+rce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandRCE + num_classes: 100 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/asym/nce.yaml b/configs/cifar100/asym/nce.yaml new file mode 100644 index 0000000..a54b4c5 --- /dev/null +++ b/configs/cifar100/asym/nce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedCrossEntropy + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/asym/nfl+mae.yaml b/configs/cifar100/asym/nfl+mae.yaml new file mode 100644 index 0000000..acf631d --- /dev/null +++ b/configs/cifar100/asym/nfl+mae.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' 
+ num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandMAE + num_classes: 100 + gamma: 0.5 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/asym/nfl+rce.yaml b/configs/cifar100/asym/nfl+rce.yaml new file mode 100644 index 0000000..484522b --- /dev/null +++ b/configs/cifar100/asym/nfl+rce.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandRCE + num_classes: 100 + gamma: 0.5 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/asym/nfl.yaml b/configs/cifar100/asym/nfl.yaml new file mode 100644 index 0000000..8ee3b92 --- /dev/null +++ b/configs/cifar100/asym/nfl.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedFocalLoss + num_classes: 100 + scale: 1.0 + gamma: 0.5 diff --git a/configs/cifar100/asym/ngce+mae.yaml b/configs/cifar100/asym/ngce+mae.yaml new file mode 100644 index 0000000..838e0e4 --- /dev/null +++ b/configs/cifar100/asym/ngce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 100 + q: 0.7 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/asym/ngce+rce.yaml b/configs/cifar100/asym/ngce+rce.yaml new file mode 100644 index 0000000..a35c906 --- /dev/null +++ b/configs/cifar100/asym/ngce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 100 + q: 0.7 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/asym/ngce.yaml b/configs/cifar100/asym/ngce.yaml new file mode 100644 index 0000000..63d3ef6 --- /dev/null +++ b/configs/cifar100/asym/ngce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 
100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedGeneralizedCrossEntropy + num_classes: 100 + scale: 1.0 + q: 0.7 diff --git a/configs/cifar100/asym/nlnl.yaml b/configs/cifar100/asym/nlnl.yaml new file mode 100644 index 0000000..1946dd5 --- /dev/null +++ b/configs/cifar100/asym/nlnl.yaml @@ -0,0 +1,32 @@ +epochs: 2000 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NLNL + num_classes: 100 + ln_neg: 110 diff --git a/configs/cifar100/asym/rce.yaml b/configs/cifar100/asym/rce.yaml new file mode 100644 index 0000000..4cc8114 --- /dev/null +++ b/configs/cifar100/asym/rce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ReverseCrossEntropy + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/asym/sce.yaml b/configs/cifar100/asym/sce.yaml new file mode 100644 index 0000000..005e078 --- /dev/null +++ b/configs/cifar100/asym/sce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: SCELoss + num_classes: 100 + alpha: 6.0 + beta: 0.1 diff --git a/configs/cifar100/sym/bhl.yaml b/configs/cifar100/sym/bhl.yaml new file mode 100644 index 0000000..8f73ef3 --- /dev/null +++ b/configs/cifar100/sym/bhl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 100 + beta: 0.8 \ No newline at end of file diff --git a/configs/cifar100/sym/bl.yaml b/configs/cifar100/sym/bl.yaml new file mode 100644 index 0000000..3ce7c37 --- /dev/null +++ b/configs/cifar100/sym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + 
+criterion: + name: BackwardLoss + num_classes: 100 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar100/sym/bsl.yaml b/configs/cifar100/sym/bsl.yaml new file mode 100644 index 0000000..346c324 --- /dev/null +++ b/configs/cifar100/sym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: BootSoftLoss + num_classes: 100 + beta: 0.95 \ No newline at end of file diff --git a/configs/cifar100/sym/ce.yaml b/configs/cifar100/sym/ce.yaml new file mode 100644 index 0000000..ece8d22 --- /dev/null +++ b/configs/cifar100/sym/ce.yaml @@ -0,0 +1,30 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: CrossEntropyLoss diff --git a/configs/cifar100/sym/d2l.yaml b/configs/cifar100/sym/d2l.yaml new file mode 100644 index 0000000..167f494 --- /dev/null +++ b/configs/cifar100/sym/d2l.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: LIDPacedLoss + num_classes: 100 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/cifar100/sym/fl.yaml b/configs/cifar100/sym/fl.yaml new file mode 100644 index 0000000..9261c73 --- /dev/null +++ b/configs/cifar100/sym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ForwardLoss + num_classes: 100 + noise_rate: 0 \ No newline at end of file diff --git a/configs/cifar100/sym/focal.yaml b/configs/cifar100/sym/focal.yaml new file mode 100644 index 0000000..d7724f9 --- /dev/null +++ b/configs/cifar100/sym/focal.yaml @@ -0,0 +1,31 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/cifar100/sym/gce.yaml 
b/configs/cifar100/sym/gce.yaml new file mode 100644 index 0000000..0182704 --- /dev/null +++ b/configs/cifar100/sym/gce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 100 + q: 0.7 diff --git a/configs/cifar100/sym/mae.yaml b/configs/cifar100/sym/mae.yaml new file mode 100644 index 0000000..453c535 --- /dev/null +++ b/configs/cifar100/sym/mae.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: MeanAbsoluteError + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/sym/nce+mae.yaml b/configs/cifar100/sym/nce+mae.yaml new file mode 100644 index 0000000..f78c570 --- /dev/null +++ b/configs/cifar100/sym/nce+mae.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandMAE + num_classes: 100 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/sym/nce+rce.yaml b/configs/cifar100/sym/nce+rce.yaml new file mode 100644 index 0000000..02bf826 --- /dev/null +++ b/configs/cifar100/sym/nce+rce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NCEandRCE + num_classes: 100 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/sym/nce.yaml b/configs/cifar100/sym/nce.yaml new file mode 100644 index 0000000..23d4724 --- /dev/null +++ b/configs/cifar100/sym/nce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedCrossEntropy + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/sym/nfl+mae.yaml b/configs/cifar100/sym/nfl+mae.yaml new file mode 100644 index 0000000..e6fb00c --- /dev/null +++ b/configs/cifar100/sym/nfl+mae.yaml @@ -0,0 +1,34 @@ 
+epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandMAE + num_classes: 100 + gamma: 0.5 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/sym/nfl+rce.yaml b/configs/cifar100/sym/nfl+rce.yaml new file mode 100644 index 0000000..86eca40 --- /dev/null +++ b/configs/cifar100/sym/nfl+rce.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NFLandRCE + num_classes: 100 + gamma: 0.5 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/sym/nfl.yaml b/configs/cifar100/sym/nfl.yaml new file mode 100644 index 0000000..67af0a2 --- /dev/null +++ b/configs/cifar100/sym/nfl.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedFocalLoss + num_classes: 100 + scale: 1.0 + gamma: 0.5 diff --git a/configs/cifar100/sym/ngce+mae.yaml b/configs/cifar100/sym/ngce+mae.yaml new file mode 100644 index 0000000..26ca360 --- /dev/null +++ b/configs/cifar100/sym/ngce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 100 + q: 0.7 + alpha: 10.0 + beta: 1.0 diff --git a/configs/cifar100/sym/ngce+rce.yaml b/configs/cifar100/sym/ngce+rce.yaml new file mode 100644 index 0000000..e1907de --- /dev/null +++ b/configs/cifar100/sym/ngce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NGCEandMAE + num_classes: 100 + q: 0.7 + alpha: 10.0 + beta: 0.1 diff --git a/configs/cifar100/sym/ngce.yaml b/configs/cifar100/sym/ngce.yaml new file mode 100644 index 0000000..cfc67f8 --- /dev/null +++ b/configs/cifar100/sym/ngce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: 
DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NormalizedGeneralizedCrossEntropy + num_classes: 100 + scale: 1.0 + q: 0.7 diff --git a/configs/cifar100/sym/nlnl.yaml b/configs/cifar100/sym/nlnl.yaml new file mode 100644 index 0000000..00729a2 --- /dev/null +++ b/configs/cifar100/sym/nlnl.yaml @@ -0,0 +1,32 @@ +epochs: 2000 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: NLNL + num_classes: 100 + ln_neg: 110 diff --git a/configs/cifar100/sym/rce.yaml b/configs/cifar100/sym/rce.yaml new file mode 100644 index 0000000..48d1f56 --- /dev/null +++ b/configs/cifar100/sym/rce.yaml @@ -0,0 +1,32 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: ReverseCrossEntropy + num_classes: 100 + scale: 1.0 diff --git a/configs/cifar100/sym/sce.yaml b/configs/cifar100/sym/sce.yaml new file mode 100644 index 0000000..f54b9d9 --- /dev/null +++ b/configs/cifar100/sym/sce.yaml @@ -0,0 +1,33 @@ +epochs: 200 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 256 + data_path: ../datasets/ + dataset_type: 'CIFAR100' + num_of_workers: 8 + +model: + name: ResNet50 + num_classes: 100 + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-5 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.0 + +criterion: + name: SCELoss + num_classes: 100 + alpha: 6.0 + beta: 0.1 diff --git a/configs/mnist/asym/bhl.yaml b/configs/mnist/asym/bhl.yaml new file mode 100644 index 0000000..e12a278 --- /dev/null +++ b/configs/mnist/asym/bhl.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.8 \ No newline at end of file diff --git a/configs/mnist/asym/bl.yaml b/configs/mnist/asym/bl.yaml new file mode 100644 index 0000000..92fb2ac --- /dev/null +++ b/configs/mnist/asym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 51 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + 
+model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: BackwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/mnist/asym/bsl.yaml b/configs/mnist/asym/bsl.yaml new file mode 100644 index 0000000..fc2aa20 --- /dev/null +++ b/configs/mnist/asym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.95 \ No newline at end of file diff --git a/configs/mnist/asym/ce.yaml b/configs/mnist/asym/ce.yaml new file mode 100644 index 0000000..1984309 --- /dev/null +++ b/configs/mnist/asym/ce.yaml @@ -0,0 +1,31 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: CrossEntropyLoss diff --git a/configs/mnist/asym/d2l.yaml b/configs/mnist/asym/d2l.yaml new file mode 100644 index 0000000..1bf7aac --- /dev/null +++ b/configs/mnist/asym/d2l.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: LIDPacedLoss + num_classes: 10 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/mnist/asym/fl.yaml b/configs/mnist/asym/fl.yaml new file mode 100644 index 0000000..2f28524 --- /dev/null +++ b/configs/mnist/asym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: ForwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/mnist/asym/focal.yaml b/configs/mnist/asym/focal.yaml new file mode 100644 index 0000000..666c446 --- /dev/null +++ b/configs/mnist/asym/focal.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + 
lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/mnist/asym/gce.yaml b/configs/mnist/asym/gce.yaml new file mode 100644 index 0000000..edc4d35 --- /dev/null +++ b/configs/mnist/asym/gce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 10 + q: 0.7 diff --git a/configs/mnist/asym/mae.yaml b/configs/mnist/asym/mae.yaml new file mode 100644 index 0000000..e5762a1 --- /dev/null +++ b/configs/mnist/asym/mae.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: MeanAbsoluteError + num_classes: 10 + scale: 1.0 diff --git a/configs/mnist/asym/nce+mae.yaml b/configs/mnist/asym/nce+mae.yaml new file mode 100644 index 0000000..5ee86e6 --- /dev/null +++ b/configs/mnist/asym/nce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NCEandMAE + num_classes: 10 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/nce+rce.yaml b/configs/mnist/asym/nce+rce.yaml new file mode 100644 index 0000000..f72ccfa --- /dev/null +++ b/configs/mnist/asym/nce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NCEandRCE + num_classes: 10 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/nce.yaml b/configs/mnist/asym/nce.yaml new file mode 100644 index 0000000..5fd6d38 --- /dev/null +++ b/configs/mnist/asym/nce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: 
CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedCrossEntropy + num_classes: 10 + scale: 10.0 diff --git a/configs/mnist/asym/nfl+mae.yaml b/configs/mnist/asym/nfl+mae.yaml new file mode 100644 index 0000000..9d5e6af --- /dev/null +++ b/configs/mnist/asym/nfl+mae.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NFLandMAE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/nfl+rce.yaml b/configs/mnist/asym/nfl+rce.yaml new file mode 100644 index 0000000..a5b20c2 --- /dev/null +++ b/configs/mnist/asym/nfl+rce.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NFLandRCE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/nfl.yaml b/configs/mnist/asym/nfl.yaml new file mode 100644 index 0000000..480e034 --- /dev/null +++ b/configs/mnist/asym/nfl.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedFocalLoss + num_classes: 10 + scale: 1.0 + gamma: 0.5 diff --git a/configs/mnist/asym/ngce+mae.yaml b/configs/mnist/asym/ngce+mae.yaml new file mode 100644 index 0000000..c959118 --- /dev/null +++ b/configs/mnist/asym/ngce+mae.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NGCEandMAE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/ngce+rce.yaml b/configs/mnist/asym/ngce+rce.yaml new file mode 100644 index 0000000..71c983a --- /dev/null +++ b/configs/mnist/asym/ngce+rce.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + 
+scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NGCEandRCE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/asym/ngce.yaml b/configs/mnist/asym/ngce.yaml new file mode 100644 index 0000000..f63ddff --- /dev/null +++ b/configs/mnist/asym/ngce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedGeneralizedCrossEntropy + num_classes: 10 + scale: 1.0 + q: 0.1 diff --git a/configs/mnist/asym/nlnl.yaml b/configs/mnist/asym/nlnl.yaml new file mode 100644 index 0000000..7bda8da --- /dev/null +++ b/configs/mnist/asym/nlnl.yaml @@ -0,0 +1,33 @@ +epochs: 720 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-3 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NLNL + num_classes: 10 + ln_neg: 1 diff --git a/configs/mnist/asym/rce.yaml b/configs/mnist/asym/rce.yaml new file mode 100644 index 0000000..71f6b2a --- /dev/null +++ b/configs/mnist/asym/rce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: ReverseCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/mnist/asym/sce.yaml b/configs/mnist/asym/sce.yaml new file mode 100644 index 0000000..1040027 --- /dev/null +++ b/configs/mnist/asym/sce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: True + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: SCELoss + alpha: 0.01 + beta: 1.0 + num_classes: 10 diff --git a/configs/mnist/sym/bhl.yaml b/configs/mnist/sym/bhl.yaml new file mode 100644 index 0000000..487ec01 --- /dev/null +++ b/configs/mnist/sym/bhl.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 
+ +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.8 \ No newline at end of file diff --git a/configs/mnist/sym/bl.yaml b/configs/mnist/sym/bl.yaml new file mode 100644 index 0000000..b98061b --- /dev/null +++ b/configs/mnist/sym/bl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 51 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: BackwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/mnist/sym/bsl.yaml b/configs/mnist/sym/bsl.yaml new file mode 100644 index 0000000..5f00f06 --- /dev/null +++ b/configs/mnist/sym/bsl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: BootSoftLoss + num_classes: 10 + beta: 0.95 \ No newline at end of file diff --git a/configs/mnist/sym/ce.yaml b/configs/mnist/sym/ce.yaml new file mode 100644 index 0000000..953817b --- /dev/null +++ b/configs/mnist/sym/ce.yaml @@ -0,0 +1,35 @@ +epochs: 1000 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + #weight_decay: 1.e-4 + momentum: 0.9 + #nesterov: True + +#scheduler: + #name: CosineAnnealingLR + #T_max: $epochs + #eta_min: 0.001 +scheduler: + name: StepLR + step_size: 100 + gamma: 0.7 + +criterion: + name: CrossEntropyLoss diff --git a/configs/mnist/sym/d2l.yaml b/configs/mnist/sym/d2l.yaml new file mode 100644 index 0000000..ba340b5 --- /dev/null +++ b/configs/mnist/sym/d2l.yaml @@ -0,0 +1,39 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.1 + weight_decay: 1.e-4 + momentum: 0.9 + +#scheduler: + #name: CosineAnnealingLR + #T_max: $epochs + #eta_min: 0.001 + +scheduler: + name: StepLR + step_size: 20 + gamma: 0.1 + +criterion: + name: LIDPacedLoss + num_classes: 10 + alpha: 1.0 + beta1: 0.1 + beta2: 1.0 \ No newline at end of file diff --git a/configs/mnist/sym/fl.yaml b/configs/mnist/sym/fl.yaml new file mode 100644 index 0000000..97f60af --- /dev/null +++ b/configs/mnist/sym/fl.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets/ + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + +scheduler: + name: CosineAnnealingLR + T_max: 
$epochs + eta_min: 0.001 + +criterion: + name: ForwardLoss + num_classes: 10 + noise_rate: 0 \ No newline at end of file diff --git a/configs/mnist/sym/focal.yaml b/configs/mnist/sym/focal.yaml new file mode 100644 index 0000000..eb9de2a --- /dev/null +++ b/configs/mnist/sym/focal.yaml @@ -0,0 +1,32 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: FocalLoss + gamma: 0.5 diff --git a/configs/mnist/sym/gce.yaml b/configs/mnist/sym/gce.yaml new file mode 100644 index 0000000..23d4c02 --- /dev/null +++ b/configs/mnist/sym/gce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 10 + q: 0.7 diff --git a/configs/mnist/sym/mae.yaml b/configs/mnist/sym/mae.yaml new file mode 100644 index 0000000..2b455ce --- /dev/null +++ b/configs/mnist/sym/mae.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: MeanAbsoluteError + num_classes: 10 + scale: 1.0 diff --git a/configs/mnist/sym/nce+mae.yaml b/configs/mnist/sym/nce+mae.yaml new file mode 100644 index 0000000..e7c7ee0 --- /dev/null +++ b/configs/mnist/sym/nce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NCEandMAE + num_classes: 10 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/sym/nce+rce.yaml b/configs/mnist/sym/nce+rce.yaml new file mode 100644 index 0000000..547ec5f --- /dev/null +++ b/configs/mnist/sym/nce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NCEandRCE + num_classes: 10 + alpha: 
1.0 + beta: 10.0 diff --git a/configs/mnist/sym/nce.yaml b/configs/mnist/sym/nce.yaml new file mode 100644 index 0000000..7230460 --- /dev/null +++ b/configs/mnist/sym/nce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedCrossEntropy + num_classes: 10 + scale: 10.0 diff --git a/configs/mnist/sym/nfl+mae.yaml b/configs/mnist/sym/nfl+mae.yaml new file mode 100644 index 0000000..76c7824 --- /dev/null +++ b/configs/mnist/sym/nfl+mae.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NFLandMAE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/sym/nfl+rce.yaml b/configs/mnist/sym/nfl+rce.yaml new file mode 100644 index 0000000..cda7b56 --- /dev/null +++ b/configs/mnist/sym/nfl+rce.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NFLandRCE + num_classes: 10 + gamma: 0.5 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/sym/nfl.yaml b/configs/mnist/sym/nfl.yaml new file mode 100644 index 0000000..3622737 --- /dev/null +++ b/configs/mnist/sym/nfl.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedFocalLoss + num_classes: 10 + scale: 1.0 + gamma: 0.5 diff --git a/configs/mnist/sym/ngce+mae.yaml b/configs/mnist/sym/ngce+mae.yaml new file mode 100644 index 0000000..265016f --- /dev/null +++ b/configs/mnist/sym/ngce+mae.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NGCEandMAE + num_classes: 10 + q: 0.1 + alpha: 1.0 + 
beta: 10.0 diff --git a/configs/mnist/sym/ngce+rce.yaml b/configs/mnist/sym/ngce+rce.yaml new file mode 100644 index 0000000..ac29154 --- /dev/null +++ b/configs/mnist/sym/ngce+rce.yaml @@ -0,0 +1,35 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NGCEandRCE + num_classes: 10 + q: 0.1 + alpha: 1.0 + beta: 10.0 diff --git a/configs/mnist/sym/ngce.yaml b/configs/mnist/sym/ngce.yaml new file mode 100644 index 0000000..4dfbca4 --- /dev/null +++ b/configs/mnist/sym/ngce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NormalizedGeneralizedCrossEntropy + num_classes: 10 + scale: 10.0 + q: 0.1 diff --git a/configs/mnist/sym/nlnl.yaml b/configs/mnist/sym/nlnl.yaml new file mode 100644 index 0000000..269d7fc --- /dev/null +++ b/configs/mnist/sym/nlnl.yaml @@ -0,0 +1,33 @@ +epochs: 720 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-3 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: NLNL + num_classes: 10 + ln_neg: 1 diff --git a/configs/mnist/sym/rce.yaml b/configs/mnist/sym/rce.yaml new file mode 100644 index 0000000..6ba45c2 --- /dev/null +++ b/configs/mnist/sym/rce.yaml @@ -0,0 +1,33 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: ReverseCrossEntropy + num_classes: 10 + scale: 1.0 diff --git a/configs/mnist/sym/sce.yaml b/configs/mnist/sym/sce.yaml new file mode 100644 index 0000000..7b8b748 --- /dev/null +++ b/configs/mnist/sym/sce.yaml @@ -0,0 +1,34 @@ +epochs: 50 +grad_bound: 5.0 +log_frequency: 200 + +dataset: + name: DatasetGenerator + asym: False + train_batch_size: 128 + eval_batch_size: 512 + data_path: ../datasets + dataset_type: 'MNIST' + num_of_workers: 8 + +model: + name: ToyModel + type: $dataset.dataset_type + +optimizer: + name: SGD + lr: 0.01 + weight_decay: 1.e-2 + momentum: 0.9 + nesterov: True + +scheduler: + name: CosineAnnealingLR + T_max: $epochs + eta_min: 0.001 + +criterion: + name: SCELoss + alpha: 0.01 + beta: 1.0 + num_classes: 10 diff --git a/configs/webvision_mini/ce.yaml 
b/configs/webvision_mini/ce.yaml new file mode 100644 index 0000000..fa81b00 --- /dev/null +++ b/configs/webvision_mini/ce.yaml @@ -0,0 +1,31 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: CrossEntropyLoss diff --git a/configs/webvision_mini/gce.yaml b/configs/webvision_mini/gce.yaml new file mode 100644 index 0000000..2ae3ef5 --- /dev/null +++ b/configs/webvision_mini/gce.yaml @@ -0,0 +1,33 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: GeneralizedCrossEntropy + num_classes: 50 + q: 0.7 diff --git a/configs/webvision_mini/nce+mae.yaml b/configs/webvision_mini/nce+mae.yaml new file mode 100644 index 0000000..ca7cbaa --- /dev/null +++ b/configs/webvision_mini/nce+mae.yaml @@ -0,0 +1,34 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: NCEandMAE + num_classes: 50 + alpha: 50.0 + beta: 1.0 diff --git a/configs/webvision_mini/nce+rce.yaml b/configs/webvision_mini/nce+rce.yaml new file mode 100644 index 0000000..782e8ad --- /dev/null +++ b/configs/webvision_mini/nce+rce.yaml @@ -0,0 +1,34 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: NCEandRCE + num_classes: 50 + alpha: 50.0 + beta: 0.1 diff --git a/configs/webvision_mini/nfl+mae.yaml b/configs/webvision_mini/nfl+mae.yaml new file mode 100644 index 0000000..6c62f2b --- /dev/null +++ b/configs/webvision_mini/nfl+mae.yaml @@ -0,0 +1,35 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + 
step_size: 1 + gamma: 0.97 + +criterion: + name: NFLandMAE + num_classes: 50 + gamma: 0.5 + alpha: 50.0 + beta: 1.0 diff --git a/configs/webvision_mini/nfl+rce.yaml b/configs/webvision_mini/nfl+rce.yaml new file mode 100644 index 0000000..250af5b --- /dev/null +++ b/configs/webvision_mini/nfl+rce.yaml @@ -0,0 +1,35 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: NFLandRCE + num_classes: 50 + gamma: 0.5 + alpha: 50.0 + beta: 0.1 diff --git a/configs/webvision_mini/sce.yaml b/configs/webvision_mini/sce.yaml new file mode 100644 index 0000000..6702128 --- /dev/null +++ b/configs/webvision_mini/sce.yaml @@ -0,0 +1,34 @@ +epochs: 250 +grad_bound: 5.0 +log_frequency: 50 + +dataset: + name: WebVisionDatasetLoader + setting: 'mini' + train_batch_size: 512 + eval_batch_size: 1024 + train_data_path: '/var/local/tmp/datasets/' + valid_data_path: '/var/local/tmp/datasets/ILSVR2012' + num_of_workers: 8 + +model: + name: resnet50 + num_classes: 50 + +optimizer: + name: SGD + lr: 0.4 + weight_decay: 3.e-5 + momentum: 0.9 + nesterov: True + +scheduler: + name: StepLR + step_size: 1 + gamma: 0.97 + +criterion: + name: SCELoss + num_classes: 50 + alpha: 10.0 + beta: 1.0 diff --git a/dataset.py b/dataset.py new file mode 100644 index 0000000..c96cf6e --- /dev/null +++ b/dataset.py @@ -0,0 +1,720 @@ +from torchvision import datasets, transforms +from torch.utils.data import DataLoader +from PIL import Image +from tqdm import tqdm +from numpy.testing import assert_array_almost_equal +import numpy as np +import os +import torch +import random +import mlconfig + + +def build_for_cifar100(size, noise): + """ random flip between two random classes. + """ + assert(noise >= 0.) and (noise <= 1.) + + P = (1. - noise) * np.eye(size) + for i in np.arange(size - 1): + P[i, i+1] = noise + + # adjust last row + P[size-1, 0] = noise + + assert_array_almost_equal(P.sum(axis=1), 1, 1) + return P + + +def multiclass_noisify(y, P, random_state=0): + """ Flip classes according to transition probability matrix T. + It expects a number between 0 and the number of classes - 1. 
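+
+    Illustrative example: with 3 classes and
+        P = [[0.7, 0.3, 0.0],
+             [0.0, 1.0, 0.0],
+             [0.0, 0.0, 1.0]]
+    roughly 30% of class-0 labels are redrawn as class 1 (one multinomial
+    draw per sample from the row P[y]), while classes 1 and 2 stay unchanged.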
+ """ + + assert P.shape[0] == P.shape[1] + assert np.max(y) < P.shape[0] + + # row stochastic matrix + assert_array_almost_equal(P.sum(axis=1), np.ones(P.shape[1])) + assert (P >= 0.0).all() + + m = y.shape[0] + new_y = y.copy() + flipper = np.random.RandomState(random_state) + + for idx in np.arange(m): + i = y[idx] + # draw a vector with only an 1 + flipped = flipper.multinomial(1, P[i, :], 1)[0] + new_y[idx] = np.where(flipped == 1)[0] + + return new_y + + +def other_class(n_classes, current_class): + """ + Returns a list of class indices excluding the class indexed by class_ind + :param nb_classes: number of classes in the task + :param class_ind: the class index to be omitted + :return: one random class that != class_ind + """ + if current_class < 0 or current_class >= n_classes: + error_str = "class_ind must be within the range (0, nb_classes - 1)" + raise ValueError(error_str) + + other_class_list = list(range(n_classes)) + other_class_list.remove(current_class) + other_class = np.random.choice(other_class_list) + return other_class + + +class MNISTNoisy(datasets.MNIST): + def __init__(self, root, train=True, transform=None, target_transform=None, download=True, nosiy_rate=0.0, asym=False, seed=0): + super(MNISTNoisy, self).__init__(root, transform=transform, target_transform=target_transform, download=download) + self.targets = self.targets.numpy() + if asym: + P = np.eye(10) + n = nosiy_rate + + P[7, 7], P[7, 1] = 1. - n, n + # 2 -> 7 + P[2, 2], P[2, 7] = 1. - n, n + + # 5 <-> 6 + P[5, 5], P[5, 6] = 1. - n, n + P[6, 6], P[6, 5] = 1. - n, n + + # 3 -> 8 + P[3, 3], P[3, 8] = 1. - n, n + + y_train_noisy = multiclass_noisify(self.targets, P=P, random_state=seed) + actual_noise = (y_train_noisy != self.targets).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + self.targets = y_train_noisy + + else: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(10)] + class_noisy = int(n_noisy / 10) + noisy_idx = [] + for d in range(10): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=10, current_class=self.targets[i]) + print(len(noisy_idx)) + + print("Print noisy label generation statistics:") + for i in range(10): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." 
% (i, n_noisy)) + + return + + +class cifar10Nosiy(datasets.CIFAR10): + def __init__(self, root, train=True, transform=None, target_transform=None, download=True, nosiy_rate=0.0, asym=False): + super(cifar10Nosiy, self).__init__(root, transform=transform, target_transform=target_transform) + self.download = download + if asym: + # automobile < - truck, bird -> airplane, cat <-> dog, deer -> horse + source_class = [9, 2, 3, 5, 4] + target_class = [1, 0, 5, 3, 7] + for s, t in zip(source_class, target_class): + cls_idx = np.where(np.array(self.targets) == s)[0] + n_noisy = int(nosiy_rate * cls_idx.shape[0]) + noisy_sample_index = np.random.choice(cls_idx, n_noisy, replace=False) + for idx in noisy_sample_index: + self.targets[idx] = t + return + elif nosiy_rate > 0: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(10)] + class_noisy = int(n_noisy / 10) + noisy_idx = [] + for d in range(10): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=10, current_class=self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(10): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." % (i, n_noisy)) + return + + +class cifar100Nosiy(datasets.CIFAR100): + def __init__(self, root, train=True, transform=None, target_transform=None, download=True, nosiy_rate=0.0, asym=False, seed=0): + super(cifar100Nosiy, self).__init__(root, download=download, transform=transform, target_transform=target_transform) + self.download = download + if asym: + """mistakes are inside the same superclass of 10 classes, e.g. 'fish' + """ + nb_classes = 100 + P = np.eye(nb_classes) + n = nosiy_rate + nb_superclasses = 20 + nb_subclasses = 5 + + if n > 0.0: + for i in np.arange(nb_superclasses): + init, end = i * nb_subclasses, (i+1) * nb_subclasses + P[init:end, init:end] = build_for_cifar100(nb_subclasses, n) + + y_train_noisy = multiclass_noisify(np.array(self.targets), P=P, random_state=seed) + actual_noise = (y_train_noisy != np.array(self.targets)).mean() + assert actual_noise > 0.0 + print('Actual noise %.2f' % actual_noise) + self.targets = y_train_noisy.tolist() + return + elif nosiy_rate > 0: + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(100)] + class_noisy = int(n_noisy / 100) + noisy_idx = [] + for d in range(100): + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=100, current_class=self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(100): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." 
% (i, n_noisy)) + return + + +@mlconfig.register +class DatasetGenerator(): + def __init__(self, + train_batch_size=128, + eval_batch_size=256, + data_path='data/', + seed=123, + num_of_workers=4, + asym=False, + dataset_type='CIFAR10', + is_cifar100=False, + cutout_length=16, + noise_rate=0.4): + self.seed = seed + np.random.seed(seed) + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.data_path = data_path + self.num_of_workers = num_of_workers + self.cutout_length = cutout_length + self.noise_rate = noise_rate + self.dataset_type = dataset_type + self.asym = asym + self.data_loaders = self.loadData() + return + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + if self.dataset_type == 'MNIST': + MEAN = [0.1307] + STD = [0.3081] + train_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(MEAN, STD)]) + + train_dataset = MNISTNoisy(root=self.data_path, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + seed=self.seed, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.MNIST(root=self.data_path, + train=False, + transform=test_transform, + download=True) + + elif self.dataset_type == 'CIFAR100': + CIFAR_MEAN = [0.5071, 0.4865, 0.4409] + CIFAR_STD = [0.2673, 0.2564, 0.2762] + + train_transform = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.RandomRotation(20), + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + train_dataset = cifar100Nosiy(root=self.data_path, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + seed=self.seed, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.CIFAR100(root=self.data_path, + train=False, + transform=test_transform, + download=True) + + elif self.dataset_type == 'CIFAR10': + CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124] + CIFAR_STD = [0.24703233, 0.24348505, 0.26158768] + + train_transform = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + test_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(CIFAR_MEAN, CIFAR_STD)]) + + train_dataset = cifar10Nosiy(root=self.data_path, + train=True, + transform=train_transform, + download=True, + asym=self.asym, + nosiy_rate=self.noise_rate) + + test_dataset = datasets.CIFAR10(root=self.data_path, + train=False, + transform=test_transform, + download=True) + else: + raise("Unknown Dataset") + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.train_batch_size, + shuffle=True, + pin_memory=True, + num_workers=self.num_of_workers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.num_of_workers) + + print("Num of train %d" % (len(train_dataset))) + print("Num of test %d" % (len(test_dataset))) + + return data_loaders + + +class Clothing1MDataset: + def __init__(self, path, type='train', transform=None, target_transform=None): + self.path = path + if type == 'test': + flist = os.path.join(path, "annotations/clean_test.txt") + elif type 
== 'valid': + flist = os.path.join(path, "annotations/clean_val.txt") + elif type == 'train': + flist = os.path.join(path, "annotations/noisy_train.txt") + else: + raise('Unknown type') + + self.imlist = self.flist_reader(flist) + self.transform = transform + + def __len__(self): + return len(self.imlist) + + def __getitem__(self, index): + impath, target = self.imlist[index] + img = Image.open(impath).convert("RGB") + if self.transform is not None: + img = self.transform(img) + return img, target + + def flist_reader(self, flist): + imlist = [] + with open(flist, 'r') as rf: + for line in rf.readlines(): + row = line.split(" ") + impath = self.path + row[0] + imlabel = row[1] + imlist.append((impath, int(imlabel))) + return imlist + + +@mlconfig.register +class Clothing1MDatasetLoader: + def __init__(self, train_batch_size=128, eval_batch_size=256, data_path='data/', num_of_workers=4, use_cutout=True, cutout_length=112): + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.data_path = data_path + self.num_of_workers = num_of_workers + self.use_cutout = use_cutout + self.cutout_length = cutout_length + self.data_loaders = self.loadData() + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + MEAN = [0.485, 0.456, 0.406] + STD = [0.229, 0.224, 0.225] + train_transform = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.RandomRotation(20), + transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2), + transforms.ToTensor(), + transforms.Normalize(mean=MEAN, std=STD), + ]) + test_transform = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.ToTensor(), + transforms.Normalize(mean=MEAN, std=STD) + ]) + if self.use_cutout: + print('Using Cutout') + train_transform.transforms.append(Cutout(self.cutout_length)) + + train_dataset = Clothing1MDataset(path=self.data_path, + type='train', + transform=train_transform) + + test_dataset = Clothing1MDataset(path=self.data_path, + type='test', + transform=test_transform) + + valid_dataset = Clothing1MDataset(path=self.data_path, + type='valid', + transform=test_transform) + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.train_batch_size, + shuffle=True, + pin_memory=True, + num_workers=self.num_of_workers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.num_of_workers) + + data_loaders['valid_dataset'] = DataLoader(dataset=valid_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.num_of_workers) + return data_loaders + + +class WebVisionDataset: + def __init__(self, path, file_name='webvision_mini_train', transform=None, target_transform=None): + self.target_list = [] + self.path = path + self.load_file(os.path.join(path, file_name)) + self.transform = transform + self.target_transform = target_transform + return + + def load_file(self, filename): + f = open(filename, "r") + for line in f: + train_file, label = line.split() + self.target_list.append((train_file, int(label))) + f.close() + return + + def __len__(self): + return len(self.target_list) + + def __getitem__(self, index): + impath, target = self.target_list[index] + img = Image.open(os.path.join(self.path, impath)).convert("RGB") + if self.transform is not None: + img = self.transform(img) + return img, target + + 
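The loaders in this file are registered with mlconfig, so the `dataset:` block of the YAML configs shown earlier is what constructs them. A minimal usage sketch follows (not part of the diff; it assumes mlconfig's usual load-and-instantiate-by-`name` behaviour and the config keys shown above):

```python
# Hedged sketch: build the data loaders and criterion named in one of the configs above.
import mlconfig
import dataset  # noqa: F401 -- importing runs the @mlconfig.register decorators for the loaders
import loss     # noqa: F401 -- likewise registers the criteria (SCELoss, NFLandRCE, ...)

config = mlconfig.load('configs/webvision_mini/nfl+rce.yaml')
data_loaders = config.dataset().getDataLoader()   # {'train_dataset': ..., 'test_dataset': ...}
criterion = config.criterion()                    # e.g. NFLandRCE(alpha=50.0, beta=0.1, ...)

for images, labels in data_loaders['train_dataset']:
    break  # in the real training loop, feed each batch to the model and criterion
```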
+@mlconfig.register +class WebVisionDatasetLoader: + def __init__(self, setting='mini', train_batch_size=128, eval_batch_size=256, train_data_path='data/', valid_data_path='data/', num_of_workers=4): + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.train_data_path = train_data_path + self.valid_data_path = valid_data_path + self.num_of_workers = num_of_workers + self.setting = setting + self.data_loaders = self.loadData() + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + train_transform = transforms.Compose([transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.2), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + test_transform = transforms.Compose([transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + if self.setting == 'mini': + train_dataset = WebVisionDataset(path=self.train_data_path, + file_name='webvision_mini_train.txt', + transform=train_transform) + + test_dataset = ImageNetMini(root=self.valid_data_path, + split='val', + transform=test_transform) + + elif self.setting == 'full': + train_dataset = WebVisionDataset(path=self.train_data_path, + file_name='train_filelist_google.txt', + transform=train_transform) + + test_dataset = WebVisionDataset(path=self.valid_data_path, + file_name='val_filelist.txt', + transform=test_transform) + + elif self.setting == 'full_imagenet': + train_dataset = WebVisionDataset(path=self.train_data_path, + file_name='train_filelist_google', + transform=train_transform) + + test_dataset = datasets.ImageNet(root=self.valid_data_path, + split='val', + transform=test_transform) + + else: + raise(NotImplementedError) + + data_loaders = {} + + print('Training Set Size %d' % (len(train_dataset))) + print('Test Set Size %d' % (len(test_dataset))) + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.train_batch_size, + shuffle=True, + pin_memory=True, + num_workers=self.num_of_workers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.eval_batch_size, + shuffle=False, + pin_memory=True, + num_workers=self.num_of_workers) + + return data_loaders + + +class ImageNetMini(datasets.ImageNet): + def __init__(self, root, split='val', download=False, **kwargs): + super(ImageNetMini, self).__init__(root, download=download, split=split, **kwargs) + self.new_targets = [] + self.new_images = [] + for i, (file, cls_id) in enumerate(self.imgs): + if cls_id <= 49: + self.new_targets.append(cls_id) + self.new_images.append((file, cls_id)) + print((file, cls_id)) + self.imgs = self.new_images + self.targets = self.new_targets + self.samples = self.imgs + print(len(self.samples)) + print(len(self.targets)) + return + + +class NosieImageNet(datasets.ImageNet): + def __init__(self, root, split='train', seed=999, download=False, target_class_num=200, nosiy_rate=0.4, **kwargs): + super(NosieImageNet, self).__init__(root, download=download, split=split, **kwargs) + random.seed(seed) + np.random.seed(seed) + self.new_idx = random.sample(list(range(0, 1000)), k=target_class_num) + print(len(self.new_idx), len(self.imgs)) + self.new_imgs = [] + self.new_targets = [] + + for file, cls_id in self.imgs: + if cls_id in self.new_idx: + new_idx = 
self.new_idx.index(cls_id) + self.new_imgs.append((file, new_idx)) + self.new_targets.append(new_idx) + self.imgs = self.new_imgs + self.targets = self.new_targets + print(min(self.targets), max(self.targets)) + # Noise + if split == 'train': + n_samples = len(self.targets) + n_noisy = int(nosiy_rate * n_samples) + print("%d Noisy samples" % (n_noisy)) + class_index = [np.where(np.array(self.targets) == i)[0] for i in range(target_class_num)] + class_noisy = int(n_noisy / target_class_num) + noisy_idx = [] + for d in range(target_class_num): + print(len(class_index[d]), d) + noisy_class_index = np.random.choice(class_index[d], class_noisy, replace=False) + noisy_idx.extend(noisy_class_index) + print("Class %d, number of noisy % d" % (d, len(noisy_class_index))) + for i in noisy_idx: + self.targets[i] = other_class(n_classes=target_class_num, current_class=self.targets[i]) + (file, old_idx) = self.imgs[i] + self.imgs[i] = (file, self.targets[i]) + print(len(noisy_idx)) + print("Print noisy label generation statistics:") + for i in range(target_class_num): + n_noisy = np.sum(np.array(self.targets) == i) + print("Noisy class %s, has %s samples." % (i, n_noisy)) + + self.samples = self.imgs + + +class ImageNetDatasetLoader: + def __init__(self, + batchSize=128, + eval_batch_size=256, + dataPath='data/', + seed=999, + target_class_num=200, + nosiy_rate=0.4, + numOfWorkers=4): + self.batchSize = batchSize + self.eval_batch_size = eval_batch_size + self.dataPath = dataPath + self.numOfWorkers = numOfWorkers + self.seed = seed + self.target_class_num = target_class_num + self.nosiy_rate = nosiy_rate + self.data_loaders = self.loadData() + + def getDataLoader(self): + return self.data_loaders + + def loadData(self): + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + + train_transform = transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.4, + contrast=0.4, + saturation=0.4, + hue=0.2), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + test_transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)]) + + train_dataset = NosieImageNet(root=self.dataPath, + split='train', + nosiy_rate=self.nosiy_rate, + target_class_num=self.target_class_num, + seed=self.seed, + transform=train_transform, + download=True) + + test_dataset = NosieImageNet(root=self.dataPath, + split='val', + nosiy_rate=self.nosiy_rate, + target_class_num=self.target_class_num, + seed=self.seed, + transform=test_transform, + download=True) + + data_loaders = {} + + data_loaders['train_dataset'] = DataLoader(dataset=train_dataset, + batch_size=self.batchSize, + shuffle=True, + pin_memory=True, + num_workers=self.numOfWorkers) + + data_loaders['test_dataset'] = DataLoader(dataset=test_dataset, + batch_size=self.batchSize, + shuffle=False, + pin_memory=True, + num_workers=self.numOfWorkers) + return data_loaders + + +def online_mean_and_sd(loader): + """Compute the mean and sd in an online fashion + + Var[x] = E[X^2] - E^2[X] + """ + cnt = 0 + fst_moment = torch.empty(3) + snd_moment = torch.empty(3) + + for data, _ in tqdm(loader): + + b, c, h, w = data.shape + nb_pixels = b * h * w + sum_ = torch.sum(data, dim=[0, 2, 3]) + sum_of_square = torch.sum(data ** 2, dim=[0, 2, 3]) + fst_moment = (cnt * fst_moment + sum_) / (cnt + nb_pixels) + snd_moment = (cnt * snd_moment + sum_of_square) / (cnt + 
nb_pixels) + + cnt += nb_pixels + + return fst_moment, torch.sqrt(snd_moment - fst_moment ** 2) + + +class Cutout(object): + def __init__(self, length): + self.length = length + + def __call__(self, img): + h, w = img.size(1), img.size(2) + mask = np.ones((h, w), np.float32) + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + mask[y1: y2, x1: x2] = 0. + mask = torch.from_numpy(mask) + mask = mask.expand_as(img) + img *= mask + return img diff --git a/datasets.py b/datasets.py deleted file mode 100644 index 5deb88a..0000000 --- a/datasets.py +++ /dev/null @@ -1,161 +0,0 @@ -import os -import multiprocessing as mp -from subprocess import call -import warnings -import numpy as np -import scipy.io as sio -import numpy as np -import keras.backend as K -from keras.datasets import mnist, cifar10, cifar100 -from keras.utils import np_utils -from util import other_class - -# Set random seed -np.random.seed(123) - -NUM_CLASSES = {'mnist': 10, 'svhn': 10, 'cifar-10': 10, 'cifar-100': 100} - -def get_data(dataset='mnist', noise_ratio=0, random_shuffle=False): - """ - Get training images with specified ratio of label noise - :param dataset: - :param noise_ratio: 0 - 100 (%) - :param random_shuffle: - :return: - """ - if dataset == 'mnist': - (X_train, y_train), (X_test, y_test) = mnist.load_data() - - X_train = X_train.reshape(-1, 28, 28, 1) - X_test = X_test.reshape(-1, 28, 28, 1) - - X_train = X_train / 255.0 - X_test = X_test / 255.0 - - elif dataset == 'svhn': - if not os.path.isfile("data/svhn_train.mat"): - print('Downloading SVHN train set...') - call( - "curl -o data/svhn_train.mat " - "http://ufldl.stanford.edu/housenumbers/train_32x32.mat", - shell=True - ) - if not os.path.isfile("data/svhn_test.mat"): - print('Downloading SVHN test set...') - call( - "curl -o data/svhn_test.mat " - "http://ufldl.stanford.edu/housenumbers/test_32x32.mat", - shell=True - ) - train = sio.loadmat('data/svhn_train.mat') - test = sio.loadmat('data/svhn_test.mat') - X_train = np.transpose(train['X'], axes=[3, 0, 1, 2]) - X_test = np.transpose(test['X'], axes=[3, 0, 1, 2]) - - X_train = X_train / 255.0 - X_test = X_test / 255.0 - - means = X_train.mean(axis=0) - # std = np.std(X_train) - X_train = (X_train - means) # / std - X_test = (X_test - means) # / std - - # reshape (n_samples, 1) to (n_samples,) and change 1-index - # to 0-index - y_train = np.reshape(train['y'], (-1,)) - 1 - y_test = np.reshape(test['y'], (-1,)) - 1 - - elif dataset == 'cifar-10': - (X_train, y_train), (X_test, y_test) = cifar10.load_data() - - X_train = X_train.reshape(-1, 32, 32, 3) - X_test = X_test.reshape(-1, 32, 32, 3) - - X_train = X_train / 255.0 - X_test = X_test / 255.0 - - means = X_train.mean(axis=0) - # std = np.std(X_train) - X_train = (X_train - means) # / std - X_test = (X_test - means) # / std - - # they are 2D originally in cifar - y_train = y_train.ravel() - y_test = y_test.ravel() - - elif dataset == 'cifar-100': - # num_classes = 100 - (X_train, y_train), (X_test, y_test) = cifar100.load_data() - - X_train = X_train.reshape(-1, 32, 32, 3) - X_test = X_test.reshape(-1, 32, 32, 3) - - X_train = X_train / 255.0 - X_test = X_test / 255.0 - - means = X_train.mean(axis=0) - # std = np.std(X_train) - X_train = (X_train - means) # / std - X_test = (X_test - means) # / std - - # they are 2D originally in cifar - y_train = y_train.ravel() - y_test = 
y_test.ravel() - else: - return None, None, None, None - - - X_train = X_train.astype('float32') - X_test = X_test.astype('float32') - - # generate random noisy labels - if noise_ratio > 0: - data_file = "data/%s_train_labels_%s.npy" % (dataset, noise_ratio) - if os.path.isfile(data_file): - y_train = np.load(data_file) - else: - n_samples = y_train.shape[0] - n_noisy = int(noise_ratio*n_samples/100) - noisy_idx = np.random.choice(n_samples, n_noisy, replace=False) - for i in noisy_idx: - y_train[i] = other_class(n_classes=NUM_CLASSES[dataset], current_class=y_train[i]) - np.save(data_file, y_train) - - if random_shuffle: - # random shuffle - idx_perm = np.random.permutation(X_train.shape[0]) - X_train, y_train = X_train[idx_perm], y_train[idx_perm] - - # one-hot-encode the labels - y_train = np_utils.to_categorical(y_train, NUM_CLASSES[dataset]) - y_test = np_utils.to_categorical(y_test, NUM_CLASSES[dataset]) - - print("X_train:", X_train.shape) - print("y_train:", y_train.shape) - print("X_test:", X_test.shape) - print("y_test", y_test.shape) - - return X_train, y_train, X_test, y_test - - -def validatation_split(X, y, split=0.1): - """ - split data to train and validation set, based on the split ratios - :param X: - :param y: - :param split: - :return: - """ - idx_val = np.round(split * X.shape[0]).astype(int) - X_val, y_val = X[:idx_val], y[:idx_val] - X_train, y_train = X[idx_val:], y[idx_val:] - return X_train, y_train, X_val, y_val - - -if __name__ == "__main__": - X_train, Y_train, X_test, Y_test = get_data(dataset='mnist', noise_ratio=40) - Y_train = np.argmax(Y_train, axis=1) - (_, Y_clean_train), (_, Y_clean_test) = mnist.load_data() - clean_selected = np.argwhere(Y_train == Y_clean_train).reshape((-1,)) - noisy_selected = np.argwhere(Y_train != Y_clean_train).reshape((-1,)) - print("#correct labels: %s, #incorrect labels: %s" % (len(clean_selected), len(noisy_selected))) \ No newline at end of file diff --git a/evaluator.py b/evaluator.py new file mode 100644 index 0000000..180e6b4 --- /dev/null +++ b/evaluator.py @@ -0,0 +1,88 @@ +import time +import torch +import os +from util import log_display, accuracy, AverageMeter + +if torch.cuda.is_available(): + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = True + device = torch.device('cuda') +else: + device = torch.device('cpu') + + +class Evaluator(): + def __init__(self, data_loader, logger, config, name='Evaluator', metrics='classfication', summary_writer=None): + self.data_loader = data_loader + self.logger = logger + self.name = name + self.summary_writer = summary_writer + self.step = 0 + self.config = config + self.log_frequency = config.log_frequency + self.loss_meters = AverageMeter() + self.acc_meters = AverageMeter() + self.acc5_meters = AverageMeter() + self.report_metrics = self.classfication_metrics if metrics == 'classfication' else self.regression_metrics + return + + def log(self, epoch, GLOBAL_STEP): + display = log_display(epoch=epoch, + global_step=GLOBAL_STEP, + time_elapse=self.time_used, + **self.logger_payload) + self.logger.info(display) + + def eval(self, epoch, GLOBAL_STEP, model, criterion): + for i, (images, labels) in enumerate(self.data_loader): + self.eval_batch(x=images, y=labels, model=model, criterion=criterion) + self.log(epoch, GLOBAL_STEP) + return + + def eval_batch(self, x, y, model, criterion): + model.eval() + x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) + start = time.time() + with 
torch.no_grad(): + pred, _ = model(x) + loss = criterion(pred, y) + end = time.time() + self.time_used = end - start + self.step += 1 + self.report_metrics(pred, y, loss) + return + + def classfication_metrics(self, x, y, loss): + acc, acc5 = accuracy(x, y, topk=(1, 5)) + self.loss_meters.update(loss.item(), y.shape[0]) + self.acc_meters.update(acc.item(), y.shape[0]) + self.acc5_meters.update(acc5.item(), y.shape[0]) + self.logger_payload = {"acc": acc, + "acc_avg": self.acc_meters.avg, + "top5_acc": acc5, + "top5_acc_avg": self.acc5_meters.avg, + "loss": loss, + "loss_avg": self.loss_meters.avg} + + if self.summary_writer is not None: + self.summary_writer.add_scalar(os.path.join(self.name, 'acc'), acc, self.step) + self.summary_writer.add_scalar(os.path.join(self.name, 'loss'), loss, self.step) + + def regression_metrics(self, x, y, loss): + diff = abs((x - y).mean().detach().item()) + self.loss_meters.update(loss.item(), y.shape[0]) + self.acc_meters.update(diff, y.shape[0]) + self.logger_payload = {"|diff|": diff, + "|diff_avg|": self.acc_meters.avg, + "loss": loss, + "loss_avg": self.loss_meters.avg} + + if self.summary_writer is not None: + self.summary_writer.add_scalar(os.path.join(self.name, 'diff'), diff, self.step) + self.summary_writer.add_scalar(os.path.join(self.name, 'loss'), loss, self.step) + + def _reset_stats(self): + self.loss_meters.reset() + self.acc_meters.reset() + self.acc5_meters.reset() diff --git a/lass.py b/lass.py new file mode 100644 index 0000000..4f68dd1 --- /dev/null +++ b/lass.py @@ -0,0 +1,68 @@ +import torch +import torch.nn.functional as F + +class lass(object): + def __init__(self, model, device, a=0.25/255., b=0.2/255., r=0.3/255., iter_max=100, clip_min=-1.0e8, clip_max=1.0e8): + # x and y_target are tensorflow placeholders, y_pred is the model output tensorflow tensor + # SEARCH PARAMETERS: a- gradient sign coefficient; b- noise coefficient; r- search radius per pixel; iter- max number of iters + self.a = a + self.b = b + self.r = r + self.model = model + self.device = device + self.iter_max = iter_max + self.clip_min = clip_min + self.clip_max = clip_max + + def find(self, X): + # elements of X in [0,1] for using default params a,b,r; otherwise scale accordingly + # generate max output label + X.requires_grad_(True) + pred, _ = self.model(X) + pred = F.softmax(pred, dim=1) + Y_pred_vec = torch.argmax(pred, dim=1) + Y_pred = F.one_hot(Y_pred_vec, pred.shape[1]).float() + + X_adv = 1.*X + adv_ind = torch.zeros(X.shape[0],dtype=torch.bool,device=self.device) + converged = False + converged_label_thres = 3 + adv_num_old = 0 + i = 0 + Y_pred_adv = pred + while i < self.iter_max and converged == False: + # I would recommend annealing the noise coefficient b gradually in this while loop + #print('on iter %s' % i) + i += 1 + #X_adv.requires_grad_(True) + loss = F.cross_entropy(Y_pred_adv, Y_pred_vec) + if i == 1: + grad = torch.autograd.grad(loss, X)[0] + else: + grad = torch.autograd.grad(loss, X_adv)[0] + X_adv = X_adv.detach() + + + step = self.a * torch.sign(grad) + self.b * torch.randn(*grad.shape, device=self.device) + X_adv += step + diff = X_adv - X + abs_diff = torch.abs(diff) + ind = abs_diff > self.r + X_adv[ind] = X[ind] + self.r * torch.sign(diff[ind]) + X_adv = torch.clamp(X_adv, self.clip_min , self.clip_max ) + + X_adv.requires_grad_(True) + Y_pred_adv, _ = self.model(X_adv) + Y_pred_adv = F.softmax(Y_pred_adv, dim=1) + Y_pred_adv_vec = torch.argmax(Y_pred_adv, dim=1) + # if we ever identify a sample as critical sample, record it + 
adv_ind = adv_ind | ~torch.eq(Y_pred_vec, Y_pred_adv_vec).to(self.device) + adv_num_new = torch.sum(adv_ind) + #print('number of adv samples: %s' % adv_num_new) + + if adv_num_new - adv_num_old < converged_label_thres: + converged = True + + adv_num_old = adv_num_new + + return X_adv, adv_ind \ No newline at end of file diff --git a/lass_tf.py b/lass_tf.py deleted file mode 100644 index f7d061c..0000000 --- a/lass_tf.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Code from Devansh Arpit -2017 - icml - A Closer Look at Memorization in Deep Networks -Adapted by Xingjun Ma to this tensorflow version. -""" - -import numpy as np -import keras.backend as K - -class lass(object): - def __init__(self, x, y_pred, y_target, a=0.25/255., b=0.2/255., r=0.3/255., iter_max=100, clip_min=-np.inf, clip_max=np.inf): - # x and y_target are tensorflow placeholders, y_pred is the model output tensorflow tensor - # SEARCH PARAMETERS: a- gradient sign coefficient; b- noise coefficient; r- search radius per pixel; iter- max number of iters - self.a = a - self.b = b - self.r = r - self.iter_max = iter_max - self.clip_min = clip_min - self.clip_max = clip_max - - loss = K.categorical_crossentropy(y_pred, y_target) - grads = K.gradients(K.mean(loss), x)[0] # this will return a list of tensors not one tensor - - self.grad_fn = K.function(inputs=[x, y_target] + [K.learning_phase()], - outputs=[grads]) - self.pred_fn = K.function(inputs=[x] + [K.learning_phase()], - outputs=[y_pred]) - - def find(self, X, bs=500): - # elements of X in [0,1] for using default params a,b,r; otherwise scale accordingly - # generate max output label - for batch in range(int(X.shape[0] / bs)): - pred_this = self.pred_fn([X[bs * batch: bs * (batch + 1)], 0])[0] - if not hasattr(self, 'Y_pred_exists'): - self.Y_pred_exists=True - Y_pred = np.zeros(shape=(X.shape[0], pred_this.shape[1]), dtype=np.float32) - Y_pred[bs * batch: bs * (batch + 1)] = (pred_this // np.max(pred_this, axis=1)[:, None]) - - Y_pred_vec = np.argmax(Y_pred, axis=1) - - X_adv = 1.*X - adv_ind = np.asarray(np.zeros((X.shape[0],)), dtype='bool') - converged = False - converged_label_thres = 20 - adv_num_old = 0 - i = 0 - while i < self.iter_max and converged == False: - # I would recommend annealing the noise coefficient b gradually in this while loop - # print('on iter %s' % i) - i += 1 - pred_adv = [] - for batch in range(int(X.shape[0] / bs)): - grad_this = self.grad_fn([X_adv[bs * batch: bs * (batch + 1)], Y_pred[bs * batch: bs * (batch + 1)], 0])[0] - - step = self.a * np.sign(grad_this) + self.b * np.random.randn(*grad_this.shape) - X_adv[bs * batch: bs * (batch + 1)] += step - diff = X_adv[bs * batch: bs * (batch + 1)] - X[bs * batch: bs * (batch + 1)] - abs_diff = np.abs(diff) - ind = abs_diff > self.r - X_adv[bs * batch: bs * (batch + 1)][ind] = X[bs * batch: bs * (batch + 1)][ind] + self.r * np.sign( - diff[ind]) - X_adv[bs * batch: bs * (batch + 1)] = np.clip(X_adv[bs * batch: bs * (batch + 1)], \ - self.clip_min , self.clip_max ) - - X_adv_this = X_adv[bs * batch: bs * (batch + 1)] - pred_this_adv = self.pred_fn([X_adv_this, 0])[0] - pred_this_adv = np.argmax(pred_this_adv, axis=1) - pred_adv.extend(list(pred_this_adv)) - - pred_adv = np.asarray(pred_adv) - - # if we ever identify a sample as critical sample, record it - adv_ind = adv_ind + (Y_pred_vec != pred_adv) - adv_num_new = np.sum(adv_ind) - # print('number of adv samples: %s' % adv_num_new) - - if adv_num_new - adv_num_old < converged_label_thres: - converged = True - - adv_num_old = adv_num_new - - return 
X_adv, adv_ind \ No newline at end of file diff --git a/lid.py b/lid.py new file mode 100644 index 0000000..38c8a2c --- /dev/null +++ b/lid.py @@ -0,0 +1,50 @@ +import torch +from scipy.spatial.distance import cdist + +def gmean(input_x, dim=0): + log_x = torch.log(input_x) + return torch.exp(torch.mean(log_x, dim=dim)) + +def get_lid_r(data, reference): + b = data.shape[0] + data = torch.flatten(data, start_dim=1) + reference = torch.flatten(reference, start_dim=1) + r = torch.cdist(data, reference, p=2) + a, idx = torch.sort(r, dim=1) + return r, a, idx + +def lid_mle(data, reference, k=20, get_idx=False, compute_mode='use_mm_for_euclid_dist_if_necessary'): + data = torch.flatten(data, start_dim=1) + reference = torch.flatten(reference, start_dim=1) + r = torch.cdist(reference, data, p=2, compute_mode=compute_mode) + a, idx = torch.sort(r, dim=1) + lids = -k / torch.sum(torch.log(a[:, 1:k+1] / a[:, k+1].view(-1,1)), dim=1) + if get_idx: + return idx, lids + return lids + +def lid_mom_est(data, reference, k, get_idx=False, compute_mode='use_mm_for_euclid_dist_if_necessary'): + b = data.shape[0] + k = min(k, b-2) + data = torch.flatten(data, start_dim=1) + reference = torch.flatten(reference, start_dim=1) + r = torch.cdist(data, reference, p=2, compute_mode=compute_mode) + a, idx = torch.sort(r, dim=1) + m = torch.mean(a[:, 1:k], dim=1) + lids = m / (a[:, k] - m) + if get_idx: + return idx, lids + return lids + +def lid_mom_est_eps(data, reference, k, get_idx=False): + b = data.shape[0] + k = min(k, b-2) + data = torch.flatten(data, start_dim=1) + reference = torch.flatten(reference, start_dim=1) + r = torch.cdist(data, reference, p=2) + a, idx = torch.sort(r, dim=1) + m = torch.mean(a[:, 1:k], dim=1) + lids = m / ((a[:, k] - m) + 1.e-4) + if get_idx: + return idx, lids + return lids \ No newline at end of file diff --git a/lid_plot.py b/lid_plot.py deleted file mode 100644 index 1229fdb..0000000 --- a/lid_plot.py +++ /dev/null @@ -1,177 +0,0 @@ -""" -Date: 28/07/2017 -LID exploration and visualization - -Author: Xingjun Ma -""" -import os -import numpy as np -import keras.backend as K -from keras.datasets import mnist, cifar10 -import matplotlib.pyplot as plt -from sklearn.decomposition import PCA -from keras.optimizers import SGD -from keras.utils import np_utils, to_categorical -from util import get_lids_random_batch, mle_batch -from datasets import get_data, validatation_split -from models import get_model -from loss import cross_entropy, boot_soft, boot_hard -from scipy.interpolate import spline, interp1d - -np.random.seed(1024) - -MODELS = ['ce', 'forward', 'backward', 'boot_soft', 'boot_hard', 'lid_dataset'] -MODEL_LABELS = ['cross-entropy', 'forward', 'backward', 'boot-soft', 'boot-hard', 'D2L'] -COLORS = ['r', 'y', 'c', 'm', 'g', 'b'] -MARKERS = ['x', 'D', '<', '>', '^', 'o'] - - -def lid_trend_through_training(model_name='ce', dataset='mnist', noise_type='sym', noise_ratio=0.): - """ - plot the lid trend for clean vs noisy samples through training. - This can provide some information about manifold learning dynamics through training. 
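For reference, the estimators implemented in lid.py above correspond, up to the exact indexing of the zero self-distance in the sorted neighbour list, to the standard maximum-likelihood and method-of-moments LID estimates over the $k$ nearest-neighbour distances $r_1(x) \le \dots \le r_k(x)$ of a point $x$ (a hedged restatement, not taken verbatim from the code):

$$
\widehat{\mathrm{LID}}_{\mathrm{MLE}}(x) = -\left(\frac{1}{k}\sum_{i=1}^{k}\log\frac{r_i(x)}{r_k(x)}\right)^{-1},
\qquad
\widehat{\mathrm{LID}}_{\mathrm{MoM}}(x) = \frac{\bar{r}(x)}{r_k(x)-\bar{r}(x)},\quad
\bar{r}(x)=\frac{1}{k-1}\sum_{i=1}^{k-1} r_i(x).
$$

`lid_mom_est_eps` is the same method-of-moments estimate with a small constant added to the denominator for numerical stability.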
- """ - print('Dataset: %s, noise type: %s, noise ratio: %.1f' % (dataset, noise_type, noise_ratio)) - - lids, acc_train, acc_test = None, None, None - - # get LID of raw inputs - lid_subset = 128 - k = 20 - X_train, Y_train, X_test, Y_test = get_data(dataset) - rand_idxes = np.random.choice(X_train.shape[0], lid_subset * 10, replace=False) - X_train = X_train[rand_idxes] - X_train = X_train.reshape((X_train.shape[0], -1)) - - lid_tmp = [] - for i in range(10): - s = i * 128 - e = (i+1)*128 - lid_tmp.extend(mle_batch(X_train[s:e], X_train[s:e], k=k)) - lid_X = np.mean(lid_tmp) - print('LID of input X: ', lid_X) - - # load pre-saved to avoid recomputing - lid_saved = "log/lid_%s_%s_%s%s.npy" % (model_name, dataset, noise_type, noise_ratio) - acc_saved = "log/acc_%s_%s_%s%s.npy" % (model_name, dataset, noise_type, noise_ratio) - if os.path.isfile(lid_saved): - lids = np.load(lid_saved) - lids = np.insert(lids, 0, lid_X) - print(lids) - - if os.path.isfile(acc_saved): - data = np.load(acc_saved) - acc_train = data[0][:] - acc_test = data[1][:] - - acc_train = np.insert(acc_train, 0, 0.) - acc_test = np.insert(acc_test, 0, 0.) - - plot(model_name, dataset, noise_ratio, lids, acc_train, acc_test) - - -def plot(model_name, dataset, noise_ratio, lids, acc_train, acc_test): - """ - plot function - """ - # plot - fig = plt.figure() # figsize=(7, 6) - xnew = np.arange(0, len(lids), 1) - - lids = lids[xnew] - acc_train = acc_train[xnew] - acc_test = acc_test[xnew] - - ax = fig.add_subplot(111) - ax.plot(xnew, lids, c='r', marker='o', markersize=3, linewidth=2, label='LID score') - - ax2 = ax.twinx() - ax2.plot(xnew, acc_train, c='b', marker='x', markersize=3, linewidth=2, label='Train acc') - ax2.plot(xnew, acc_test, c='c', marker='^', markersize=3, linewidth=2, label='Test acc') - - # ax.set_xticks([]) - # ax.set_yticks([]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Subspace dimensionality (LID score)", fontsize=15) - ax2.set_ylabel("Train/test accuracy", fontsize=15) - # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) - - if dataset == 'mnist': - ax.set_ylim((4, 22)) # for mnist - ax2.set_ylim((0.2, 1.2)) - elif dataset == 'svhn': - ax.set_ylim((7, 20)) # for svhn - ax2.set_ylim((0.2, 1.2)) - elif dataset == 'cifar-10': - ax.set_ylim((2.5, 12.5)) # for cifar-10 - ax2.set_ylim((0.2, 1.2)) - elif dataset == 'cifar-100': - ax.set_ylim((3, 12)) # for cifar-100 - ax2.set_ylim((0., 1.)) - - legend = ax.legend(loc='upper left') - plt.setp(legend.get_texts(), fontsize=15) - legend2 = ax2.legend(loc='upper right') - plt.setp(legend2.get_texts(), fontsize=15) - fig.savefig("plots/lid_trend_%s_%s_%s.png" % (model_name, dataset, noise_ratio), dpi=300) - plt.show() - - -def lid_trend_of_learning_models(model_list=['ce'], dataset='mnist', noise_ratio=0): - """ - The LID trend of different learning models throughout. 
- """ - # plot initialization - fig = plt.figure() # figsize=(7, 6) - ax = fig.add_subplot(111) - - # get LID of raw inputs - lid_subset = 128 - k = 20 - X_train, Y_train, X_test, Y_test = get_data(dataset) - rand_idxes = np.random.choice(X_train.shape[0], lid_subset * 10, replace=False) - X_train = X_train[rand_idxes] - X_train = X_train.reshape((X_train.shape[0], -1)) - - lid_tmp = [] - for i in range(10): - s = i * 128 - e = (i + 1) * 128 - lid_tmp.extend(mle_batch(X_train[s:e], X_train[s:e], k=k)) - lid_X = np.mean(lid_tmp) - print('LID of input X: ', lid_X) - - for model_name in model_list: - file_name = "log/lid_%s_%s_%s.npy" % (model_name, dataset, noise_ratio) - if os.path.isfile(file_name): - lids = np.load(file_name) - # insert lid of raw input X - lids = np.insert(lids, 0, lid_X) - print(lids) - - # Find indicies that you need to replace - inds = np.where(np.isnan(lids)) - lids[inds] = np.nanmean(lids) - # smooth for plot - lids[lids < 0] = 0 - lids[lids > 10] = 10 - - xnew = np.arange(0, len(lids), 1) - lids = lids[xnew] - - # plot line - idx = MODELS.index(model_name) - ax.plot(xnew, lids, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) - - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Subspace dimensionality (LID score)", fontsize=15) - # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) - legend = plt.legend(loc='lower center', ncol=2) - plt.setp(legend.get_texts(), fontsize=15) - fig.savefig("plots/lid_trend_all_models_%s_%s.png" % (dataset, noise_ratio), dpi=300) - plt.show() - -if __name__ == "__main__": - lid_trend_through_training(model_name='ce', dataset='cifar-100', noise_type='sym', noise_ratio=0.) - # lid_trend_of_learning_models(model_list=['ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'lid_dataset'], - # dataset='cifar-10', noise_ratio=60) diff --git a/loss.py b/loss.py index 60b5aa4..73e0e59 100644 --- a/loss.py +++ b/loss.py @@ -1,146 +1,526 @@ +import torch +import torch.nn.functional as F import numpy as np -from keras import backend as K -import tensorflow as tf +import mlconfig +mlconfig.register(torch.nn.CrossEntropyLoss) - -def symmetric_cross_entropy(alpha, beta): - """ - Symmetric Cross Entropy: - ICCV2019 "Symmetric Cross Entropy for Robust Learning with Noisy Labels" - https://arxiv.org/abs/1908.06112 - """ - def loss(y_true, y_pred): - y_true_1 = y_true - y_pred_1 = y_pred - - y_true_2 = y_true - y_pred_2 = y_pred - - y_pred_1 = tf.clip_by_value(y_pred_1, 1e-7, 1.0) - y_true_2 = tf.clip_by_value(y_true_2, 1e-4, 1.0) - - return alpha*tf.reduce_mean(-tf.reduce_sum(y_true_1 * tf.log(y_pred_1), axis = -1)) + beta*tf.reduce_mean(-tf.reduce_sum(y_pred_2 * tf.log(y_true_2), axis = -1)) - return loss - -def cross_entropy(y_true, y_pred): - return K.categorical_crossentropy(y_true, y_pred) - - -def boot_soft(y_true, y_pred): - """ - 2015 - iclrws - Training deep neural networks on noisy labels with bootstrapping. - https://arxiv.org/abs/1412.6596 - - :param y_true: - :param y_pred: - :return: - """ - beta = 0.95 - - y_pred /= K.sum(y_pred, axis=-1, keepdims=True) - y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()) - return -K.sum((beta * y_true + (1. - beta) * y_pred) * - K.log(y_pred), axis=-1) - - -def boot_hard(y_true, y_pred): - """ - 2015 - iclrws - Training deep neural networks on noisy labels with bootstrapping. 
- https://arxiv.org/abs/1412.6596 - - :param y_true: - :param y_pred: - :return: - """ - beta = 0.8 - - y_pred /= K.sum(y_pred, axis=-1, keepdims=True) - y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()) - pred_labels = K.one_hot(K.argmax(y_pred, 1), num_classes=K.shape(y_true)[1]) - return -K.sum((beta * y_true + (1. - beta) * pred_labels) * - K.log(y_pred), axis=-1) - - -def forward(P): - """ - Making Deep Neural Networks Robust to Label Noise: a Loss Correction Approach - CVPR17 https://arxiv.org/abs/1609.03683 - :param P: noise model, a noisy label transition probability matrix - :return: - """ - P = K.constant(P) - - def loss(y_true, y_pred): - y_pred /= K.sum(y_pred, axis=-1, keepdims=True) - y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()) - return -K.sum(y_true * K.log(K.dot(y_pred, P)), axis=-1) - - return loss - - -def backward(P): - """ - Making Deep Neural Networks Robust to Label Noise: a Loss Correction Approach - CVPR17 https://arxiv.org/abs/1609.03683 - :param P: noise model, a noisy label transition probability matrix - :return: - """ - P_inv = K.constant(np.linalg.inv(P)) - - def loss(y_true, y_pred): - y_pred /= K.sum(y_pred, axis=-1, keepdims=True) - y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()) - return -K.sum(K.dot(y_true, P_inv) * K.log(y_pred), axis=-1) - - return loss - - -def lid(logits, k=20): - """ - Calculate LID for each data point in the array. - - :param logits: - :param k: - :return: - """ - batch_size = tf.shape(logits)[0] - # n_samples = logits.get_shape().as_list() - # calculate pairwise distance - r = tf.reduce_sum(logits * logits, 1) - # turn r into column vector - r1 = tf.reshape(r, [-1, 1]) - D = r1 - 2 * tf.matmul(logits, tf.transpose(logits)) + tf.transpose(r1) + \ - tf.ones([batch_size, batch_size]) - - # find the k nearest neighbor - D1 = -tf.sqrt(D) - D2, _ = tf.nn.top_k(D1, k=k, sorted=True) - D3 = -D2[:, 1:] # skip the x-to-x distance 0 by using [,1:] - - m = tf.transpose(tf.multiply(tf.transpose(D3), 1.0 / D3[:, -1])) - v_log = tf.reduce_sum(tf.log(m + K.epsilon()), axis=1) # to avoid nan - lids = -k / v_log - - return lids - - -def lid_paced_loss(alpha=1.0, beta1=0.1, beta2=1.0): - """TO_DO - Class wise lid pace learning, targeting classwise asymetric label noise. - - Args: - alpha: lid based adjustment paramter: this needs real-time update. - Returns: - Loss tensor of type float. - """ - if alpha == 1.0: - return symmetric_cross_entropy(alpha=beta1, beta=beta2) +if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + if torch.cuda.device_count() > 1: + device = torch.device('cuda:0') else: - def loss(y_true, y_pred): - pred_labels = K.one_hot(K.argmax(y_pred, 1), num_classes=K.shape(y_true)[1]) - y_new = alpha * y_true + (1. 
- alpha) * pred_labels - y_pred /= K.sum(y_pred, axis=-1, keepdims=True) - y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon()) - return -K.sum(y_new * K.log(y_pred), axis=-1) + device = torch.device('cuda') +else: + device = torch.device('cpu') + + +@mlconfig.register +class SCELoss(torch.nn.Module): + def __init__(self, alpha, beta, num_classes=10): + super(SCELoss, self).__init__() + self.device = device + self.alpha = alpha + self.beta = beta + self.num_classes = num_classes + self.cross_entropy = torch.nn.CrossEntropyLoss() + + def forward(self, pred, labels): + # CCE + ce = self.cross_entropy(pred, labels) + + # RCE + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + + # Loss + loss = self.alpha * ce + self.beta * rce.mean() + return loss + +@mlconfig.register +class ReverseCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(ReverseCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + return self.scale * rce.mean() + + +@mlconfig.register +class NormalizedReverseCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedReverseCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + normalizor = 1 / 4 * (self.num_classes - 1) + rce = (-1*torch.sum(pred * torch.log(label_one_hot), dim=1)) + return self.scale * normalizor * rce.mean() + + +@mlconfig.register +class NormalizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + + def forward(self, pred, labels): + pred = F.log_softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + nce = -1 * torch.sum(label_one_hot * pred, dim=1) / (- pred.sum(dim=1)) + return self.scale * nce.mean() + + +@mlconfig.register +class GeneralizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, q=0.7): + super(GeneralizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.q = q + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + gce = (1. 
- torch.pow(torch.sum(label_one_hot * pred, dim=1), self.q)) / self.q + return gce.mean() + + +@mlconfig.register +class NormalizedGeneralizedCrossEntropy(torch.nn.Module): + def __init__(self, num_classes, scale=1.0, q=0.7): + super(NormalizedGeneralizedCrossEntropy, self).__init__() + self.device = device + self.num_classes = num_classes + self.q = q + self.scale = scale + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + numerators = 1. - torch.pow(torch.sum(label_one_hot * pred, dim=1), self.q) + denominators = self.num_classes - pred.pow(self.q).sum(dim=1) + ngce = numerators / denominators + return self.scale * ngce.mean() + + +@mlconfig.register +class MeanAbsoluteError(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(MeanAbsoluteError, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + return + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + mae = 1. - torch.sum(label_one_hot * pred, dim=1) + # Note: Reduced MAE + # Original: torch.abs(pred - label_one_hot).sum(dim=1) + # $MAE = \sum_{k=1}^{K} |\bm{p}(k|\bm{x}) - \bm{q}(k|\bm{x})|$ + # $MAE = \sum_{k=1}^{K}\bm{p}(k|\bm{x}) - p(y|\bm{x}) + (1 - p(y|\bm{x}))$ + # $MAE = 2 - 2p(y|\bm{x})$ + # + return self.scale * mae.mean() + + +@mlconfig.register +class NormalizedMeanAbsoluteError(torch.nn.Module): + def __init__(self, num_classes, scale=1.0): + super(NormalizedMeanAbsoluteError, self).__init__() + self.device = device + self.num_classes = num_classes + self.scale = scale + return + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + label_one_hot = torch.nn.functional.one_hot(labels, self.num_classes).float().to(self.device) + normalizor = 1 / (2 * (self.num_classes - 1)) + mae = 1. 
- torch.sum(label_one_hot * pred, dim=1) + return self.scale * normalizor * mae.mean() + + +@mlconfig.register +class NCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes): + super(NCEandRCE, self).__init__() + self.num_classes = num_classes + self.nce = NormalizedCrossEntropy(scale=alpha, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nce(pred, labels) + self.rce(pred, labels) + + +@mlconfig.register +class NCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes): + super(NCEandMAE, self).__init__() + self.num_classes = num_classes + self.nce = NormalizedCrossEntropy(scale=alpha, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nce(pred, labels) + self.mae(pred, labels) + + +@mlconfig.register +class GCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandMAE, self).__init__() + self.num_classes = num_classes + self.gce = GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.mae(pred, labels) + + +@mlconfig.register +class GCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandRCE, self).__init__() + self.num_classes = num_classes + self.gce = GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.rce(pred, labels) + + +@mlconfig.register +class GCEandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(GCEandNCE, self).__init__() + self.num_classes = num_classes + self.gce = GeneralizedCrossEntropy(num_classes=num_classes, q=q) + self.nce = NormalizedCrossEntropy(num_classes=num_classes) + + def forward(self, pred, labels): + return self.gce(pred, labels) + self.nce(pred, labels) + + +@mlconfig.register +class NGCEandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandNCE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.nce = NormalizedCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.nce(pred, labels) + + +@mlconfig.register +class NGCEandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandMAE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.mae(pred, labels) + + +@mlconfig.register +class NGCEandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, q=0.7): + super(NGCEandRCE, self).__init__() + self.num_classes = num_classes + self.ngce = NormalizedGeneralizedCrossEntropy(scale=alpha, q=q, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.ngce(pred, labels) + self.rce(pred, labels) + + +@mlconfig.register +class MAEandRCE(torch.nn.Module): + def __init__(self, alpha, 
beta, num_classes): + super(MAEandRCE, self).__init__() + self.num_classes = num_classes + self.mae = MeanAbsoluteError(scale=alpha, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.mae(pred, labels) + self.rce(pred, labels) + + +@mlconfig.register +class NLNL(torch.nn.Module): + def __init__(self, train_loader, num_classes, ln_neg=1): + super(NLNL, self).__init__() + self.device = device + self.num_classes = num_classes + self.ln_neg = ln_neg + weight = torch.FloatTensor(num_classes).zero_() + 1. + if not hasattr(train_loader.dataset, 'targets'): + weight = [1] * num_classes + weight = torch.FloatTensor(weight) + else: + for i in range(num_classes): + weight[i] = (torch.from_numpy(np.array(train_loader.dataset.targets)) == i).sum() + weight = 1 / (weight / weight.max()) + self.weight = weight.to(self.device) + self.criterion = torch.nn.CrossEntropyLoss(weight=self.weight) + self.criterion_nll = torch.nn.NLLLoss() + + def forward(self, pred, labels): + labels_neg = (labels.unsqueeze(-1).repeat(1, self.ln_neg) + + torch.LongTensor(len(labels), self.ln_neg).to(self.device).random_(1, self.num_classes)) % self.num_classes + labels_neg = torch.autograd.Variable(labels_neg) + + assert labels_neg.max() <= self.num_classes-1 + assert labels_neg.min() >= 0 + assert (labels_neg != labels.unsqueeze(-1).repeat(1, self.ln_neg)).sum() == len(labels)*self.ln_neg + + s_neg = torch.log(torch.clamp(1. - F.softmax(pred, 1), min=1e-5, max=1.)) + s_neg *= self.weight[labels].unsqueeze(-1).expand(s_neg.size()).to(self.device) + labels = labels * 0 - 100 + loss = self.criterion(pred, labels) * float((labels >= 0).sum()) + loss_neg = self.criterion_nll(s_neg.repeat(self.ln_neg, 1), labels_neg.t().contiguous().view(-1)) * float((labels_neg >= 0).sum()) + loss = ((loss+loss_neg) / (float((labels >= 0).sum())+float((labels_neg[:, 0] >= 0).sum()))) return loss + + +@mlconfig.register +class FocalLoss(torch.nn.Module): + ''' + https://github.com/clcarwin/focal_loss_pytorch/blob/master/focalloss.py + ''' + + def __init__(self, gamma=0, alpha=None, size_average=True): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + if isinstance(alpha, (float, int)): + self.alpha = torch.Tensor([alpha, 1-alpha]) + if isinstance(alpha, list): + self.alpha = torch.Tensor(alpha) + self.size_average = size_average + + def forward(self, input, target): + if input.dim() > 2: + input = input.view(input.size(0), input.size(1), -1) # N,C,H,W => N,C,H*W + input = input.transpose(1, 2) # N,C,H*W => N,H*W,C + input = input.contiguous().view(-1, input.size(2)) # N,H*W,C => N*H*W,C + target = target.view(-1, 1) + + logpt = F.log_softmax(input, dim=1) + logpt = logpt.gather(1, target) + logpt = logpt.view(-1) + pt = torch.autograd.Variable(logpt.data.exp()) + + if self.alpha is not None: + if self.alpha.type() != input.data.type(): + self.alpha = self.alpha.type_as(input.data) + at = self.alpha.gather(0, target.data.view(-1)) + logpt = logpt * torch.autograd.Variable(at) + + loss = -1 * (1-pt)**self.gamma * logpt + if self.size_average: + return loss.mean() + else: + return loss.sum() + + +@mlconfig.register +class NormalizedFocalLoss(torch.nn.Module): + def __init__(self, scale=1.0, gamma=0, num_classes=10, alpha=None, size_average=True): + super(NormalizedFocalLoss, self).__init__() + self.gamma = gamma + self.size_average = size_average + self.num_classes = num_classes + self.scale = scale + + def forward(self, input, 
target): + target = target.view(-1, 1) + logpt = F.log_softmax(input, dim=1) + normalizor = torch.sum(-1 * (1 - logpt.data.exp()) ** self.gamma * logpt, dim=1) + logpt = logpt.gather(1, target) + logpt = logpt.view(-1) + pt = torch.autograd.Variable(logpt.data.exp()) + loss = -1 * (1-pt)**self.gamma * logpt + loss = self.scale * loss / normalizor + + if self.size_average: + return loss.mean() + else: + return loss.sum() + + +@mlconfig.register +class NFLandNCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandNCE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.nce = NormalizedCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.nce(pred, labels) + + +@mlconfig.register +class NFLandMAE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandMAE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.mae = MeanAbsoluteError(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.mae(pred, labels) + + +@mlconfig.register +class NFLandRCE(torch.nn.Module): + def __init__(self, alpha, beta, num_classes, gamma=0.5): + super(NFLandRCE, self).__init__() + self.num_classes = num_classes + self.nfl = NormalizedFocalLoss(scale=alpha, gamma=gamma, num_classes=num_classes) + self.rce = ReverseCrossEntropy(scale=beta, num_classes=num_classes) + + def forward(self, pred, labels): + return self.nfl(pred, labels) + self.rce(pred, labels) + + +@mlconfig.register +class DMILoss(torch.nn.Module): + def __init__(self, num_classes): + super(DMILoss, self).__init__() + self.num_classes = num_classes + + def forward(self, output, target): + outputs = F.softmax(output, dim=1) + targets = target.reshape(target.size(0), 1).cpu() + y_onehot = torch.FloatTensor(target.size(0), self.num_classes).zero_() + y_onehot.scatter_(1, targets, 1) + y_onehot = y_onehot.transpose(0, 1).cuda() + mat = y_onehot @ outputs + return -1.0 * torch.log(torch.abs(torch.det(mat.float())) + 0.001) + +@mlconfig.register +class BootSoftLoss(torch.nn.Module): + def __init__(self, num_classes, beta=0.95): + super(BootSoftLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.beta = beta + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + bsl = -torch.sum((self.beta * label_one_hot + (1. - self.beta) * pred) * torch.log(pred), dim=1) + return bsl.mean() + +@mlconfig.register +class BootHardLoss(torch.nn.Module): + def __init__(self, num_classes, beta=0.8): + super(BootSoftLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.beta = beta + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + pred_one_hot = F.one_hot(torch.argmax(pred, dim=1),self.num_classes) + bhl = -torch.sum((self.beta * label_one_hot + (1. 
- self.beta) * pred_one_hot) * torch.log(pred), dim=1) + return bhl.mean() + +@mlconfig.register +class ForwardLoss(torch.nn.Module): + def __init__(self, num_classes, noise_rate): + super(ForwardLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.noise_rate = noise_rate + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + assert (self.noise_rate >= 0.) and (self.noise_rate <= 1.) + P = self.noise_rate / (self.num_classes - 1) * torch.ones((self.num_classes, self.num_classes)) + P.diagonal().fill_(1-self.noise_rate) + P = P.to(self.device) + loss=-torch.sum(label_one_hot * torch.log(torch.matmul(pred, P)), dim=-1) + return loss.mean() + + +@mlconfig.register +class BackwardLoss(torch.nn.Module): + def __init__(self, num_classes, noise_rate): + super(BackwardLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.noise_rate = noise_rate + + def forward(self, pred, labels): + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + assert (self.noise_rate >= 0.) and (self.noise_rate <= 1.) + P = self.noise_rate / (self.num_classes - 1) * torch.ones((self.num_classes, self.num_classes)) + P.diagonal().fill_(1-self.noise_rate) + P = P.to(self.device) + P_inv = torch.inverse(P) + loss=-torch.sum((torch.matmul(label_one_hot, P_inv)) * torch.log(pred), dim=-1) + return loss.mean() + +@mlconfig.register +class LIDPacedLoss(torch.nn.Module): + def __init__(self, num_classes, alpha, beta1, beta2): + super(LIDPacedLoss, self).__init__() + self.device = device + self.num_classes = num_classes + self.alpha = alpha + self.beta1 = beta1 + self.beta2 = beta2 + self.sce = SCELoss(alpha=beta1, beta=beta2, num_classes=num_classes) + + def forward(self, pred, labels): + if self.alpha == 1.0: + return self.sce(pred, labels) + else: + pred = F.softmax(pred, dim=1) + pred = torch.clamp(pred, min=1e-7, max=1.0) + label_one_hot = F.one_hot(labels, self.num_classes).float().to(self.device) + label_one_hot = torch.clamp(label_one_hot, min=1e-4, max=1.0) + pred_labels = F.one_hot(torch.argmax(pred, dim=1), num_classes=label_one_hot.size(1)) + y_new = self.alpha * label_one_hot + (1. - self.alpha) * pred_labels + loss = -torch.sum(y_new * torch.log(pred), dim=-1) + return loss.mean() \ No newline at end of file diff --git a/loss_acc_plot.py b/loss_acc_plot.py deleted file mode 100644 index 58259d3..0000000 --- a/loss_acc_plot.py +++ /dev/null @@ -1,131 +0,0 @@ -""" -Train test error/accuracy/loss plot. 
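For reference, a minimal usage sketch of the combined losses registered in the loss.py hunk above (here NFLandRCE). The alpha/beta values, batch size and class count are illustrative placeholders rather than values taken from the shipped `*.yaml` configs, and the module-level `device` used by some of the classes is assumed to resolve to the machine in use.

```python
import torch
from loss import NFLandRCE  # class added in the loss.py hunk above

# illustrative hyper-parameters; actual runs take them from the config files
criterion = NFLandRCE(alpha=1.0, beta=1.0, num_classes=10, gamma=0.5)

logits = torch.randn(32, 10, requires_grad=True)  # raw model outputs, no softmax applied
labels = torch.randint(0, 10, (32,))              # (possibly noisy) integer class labels

loss = criterion(logits, labels)                  # normalized focal term + reverse CE term
loss.backward()
```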
- -Author: Xingjun Ma -""" -import os -import numpy as np -import tensorflow as tf -import keras.backend as K -from keras.datasets import mnist, cifar10 -from keras.optimizers import SGD -from keras.utils import to_categorical -import matplotlib.pyplot as plt -from sklearn.decomposition import PCA -from util import get_lids_random_batch -from datasets import get_data, validatation_split -from models import get_model -from loss import cross_entropy, boot_soft, boot_hard -from lass_tf import lass - -np.random.seed(1024) - -# MODELS = ['ce', 'd2l', 'backward', 'boot_soft', 'boot_hard', 'forward'] - -MODELS = ['ce', 'forward', 'backward', 'boot_soft', 'boot_hard', 'd2l'] -MODEL_LABELS = ['cross-entropy', 'forward', 'backward', 'boot-soft', 'boot-hard', 'D2L'] -COLORS = ['r', 'y', 'c', 'm', 'g', 'b'] -MARKERS = ['x', 'D', '<', '>', '^', 'o'] - -def test_acc(model_list, dataset='mnist', noise_ratio=0.): - """ - Test acc throughout training. - """ - print('Dataset: %s, noise ratio: %s%%' % (dataset, noise_ratio)) - - # plot initialization - fig = plt.figure() # figsize=(7, 6) - ax = fig.add_subplot(111) - - for model_name in model_list: - file_name = 'log/acc_%s_%s_%s.npy' % \ - (model_name, dataset, noise_ratio) - if os.path.isfile(file_name): - accs = np.load(file_name) - train_accs = accs[0] - test_accs = accs[1] - # print(test_accs) - - # plot line - idx = MODELS.index(model_name) - - xnew = np.arange(0, len(test_accs), 1) - test_accs = test_accs[xnew] - ax.plot(xnew, test_accs, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) - - # ax.set_xticks([]) - # ax.set_yticks([]) - ax.set_xlabel("Epoch", fontsize=15) - ax.set_ylabel("Test accuracy", fontsize=15) - # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) - legend = plt.legend(loc='lower right', ncol=2) - plt.setp(legend.get_texts(), fontsize=15) - fig.savefig("plots/test_acc_trend_all_models_%s_%s.png" % (dataset, noise_ratio), dpi=300) - plt.show() - - -def test_acc_last_epoch(model_list, dataset='mnist', num_classes=10, noise_ratio=10, epochs=50): - """ - Test acc throughout training. - """ - print('Dataset: %s, epochs: %s, noise ratio: %s%%' % (dataset, epochs, noise_ratio)) - - # load data - _, _, X_test, Y_test = get_data(dataset) - # convert class vectors to binary class matrices - Y_test = to_categorical(Y_test, num_classes) - - # load model - image_shape = X_test.shape[1:] - model = get_model(dataset, input_tensor=None, input_shape=image_shape) - sgd = SGD(lr=0.01, momentum=0.9) - - for model_name in model_list: - # the critical sample ratio of the representations learned at every epoch - model_path = 'model/%s_%s_%s.hdf5' % (model_name, dataset, noise_ratio) - model.load_weights(model_path) - model.compile( - loss=cross_entropy, - optimizer=sgd, - metrics=['accuracy'] - ) - - _, test_acc = model.evaluate(X_test, Y_test, batch_size=128, verbose=0) - print('model: %s, epoch: %s, test_acc: %s' % (model_name, epochs-1, test_acc)) - -def print_loss_acc_log(model_list, dataset='mnist', noise_ratio=0.1): - """ - Test acc throughout training. 
- - :param model_list: - :param dataset: - :param noise_ratio: - :return: - """ - print('Dataset: %s, noise ratio: %s' % (dataset, noise_ratio)) - - for model_name in model_list: - loss_file = 'log/loss_%s_%s_%s.npy' % \ - (model_name, dataset, noise_ratio) - acc_file = 'log/acc_%s_%s_%s.npy' % \ - (model_name, dataset, noise_ratio) - if os.path.isfile(loss_file): - losses = np.load(loss_file) - # print(losses) - val_loss = losses[1, -5:] - print('--------- val loss ---------') - print(val_loss) - if os.path.isfile(acc_file): - accs = np.load(acc_file) - print('ecpos: ', len(accs[1])) - val_acc = accs[1, -5:] - print('--------- val acc ---------') - print(val_acc) - -if __name__ == "__main__": - # mnist: epoch=50, cifar-10: epoch=120 - # test_acc(model_list=['ce'], dataset='cifar-10', noise_ratio=40) - - # test_acc_last_epoch(model_list=['ce'], - # dataset='cifar-10', num_classes=10, noise_ratio=40, epochs=120) - print_loss_acc_log(model_list=['boot_hard'], dataset='cifar-100', noise_ratio=0) diff --git a/main.py b/main.py new file mode 100644 index 0000000..885aca1 --- /dev/null +++ b/main.py @@ -0,0 +1,206 @@ +import torch +import argparse +import util +import os +import datetime +import random +import mlconfig +import loss +import models +import dataset +import shutil +from evaluator import Evaluator +from trainer import Trainer +from util import get_lids_random_batch,get_csr_random_batch +from callback_util import D2LCallback +import numpy as np +from plot import lid_trend_through_training, lid_trend_of_learning_models, test_acc_trend_of_learning_models, csr_trend_of_learning_models + +# ArgParse +parser = argparse.ArgumentParser(description='Normalized Loss Functions for Deep Learning with Noisy Labels') +# Training +parser.add_argument('--resume', action='store_true', default=False) +parser.add_argument('--seed', type=int, default=0) +parser.add_argument('--config_path', type=str, default='configs') +parser.add_argument('--version', type=str, default='ce') +parser.add_argument('--exp_name', type=str, default="run1") +parser.add_argument('--load_model', action='store_true', default=False) +parser.add_argument('--data_parallel', action='store_true', default=False) +parser.add_argument('--asym', action='store_true', default=False) +parser.add_argument('--noise_rate', type=float, default=0.0) +parser.add_argument('--plot', action='store_true', default=False) +parser.add_argument('--plotall', action='store_true', default=False) +args = parser.parse_args() + +# Set up +if args.exp_name == '' or args.exp_name is None: + args.exp_name = 'exp_' + datetime.datetime.now() +exp_path = os.path.join(args.exp_name, args.version) +log_file_path = os.path.join(exp_path, args.version) +checkpoint_path = os.path.join(exp_path, 'checkpoints') +checkpoint_path_file = os.path.join(checkpoint_path, args.version) +util.build_dirs(exp_path) +util.build_dirs(checkpoint_path) + +logger = util.setup_logger(name=args.version, log_file=log_file_path + ".log") +for arg in vars(args): + logger.info("%s: %s" % (arg, getattr(args, arg))) + +random.seed(args.seed) +if torch.cuda.is_available(): + torch.cuda.manual_seed(args.seed) + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + device = torch.device('cuda') + logger.info("Using CUDA!") + device_list = [torch.cuda.get_device_name(i) for i in range(0, torch.cuda.device_count())] + logger.info("GPU List: %s" % (device_list)) +else: + device = torch.device('cpu') + +logger.info("PyTorch Version: %s" % (torch.__version__)) +config_file = 
os.path.join(args.config_path, args.version) + '.yaml' +config = mlconfig.load(config_file) +if args.version == 'fl' or args.version == 'bl': + config['criterion']['noise_rate']=args.noise_rate +if args.version != 'd2l': + config.set_immutable() +shutil.copyfile(config_file, os.path.join(exp_path, args.version+'.yaml')) +for key in config: + logger.info("%s: %s" % (key, config[key])) + + +def train(starting_epoch, model, data_loader, optimizer, scheduler, criterion, trainer, evaluator, ENV, callback, mode): + for epoch in range(starting_epoch, config.epochs): + if args.version == 'd2l': + if mode == 'stage2': + config['criterion']['alpha'] = callback.alpha + criterion=config.criterion() + + logger.info("="*20 + "Training" + "="*20) + + # Train + ENV['global_step'] = trainer.train(epoch, ENV['global_step'], model, optimizer, criterion) + scheduler.step() + + if args.version == 'd2l': + callback.on_epoch_begin(epoch) + if mode == 'stage1': + if callback.is_found_turning_point == True: + break + + # Eval + logger.info("="*20 + "Eval" + "="*20) + evaluator.eval(epoch, ENV['global_step'], model, torch.nn.CrossEntropyLoss()) + payload = ('Eval Loss:%.4f\tEval acc: %.2f' % (evaluator.loss_meters.avg, evaluator.acc_meters.avg*100)) + logger.info(payload) + # LID + lids = get_lids_random_batch(model, data_loader, device, k=20, batch_size=128) + lid = lids.mean() + logger.info('LID:%f'%(lid)) + # CSR + csr = get_csr_random_batch(model, data_loader, device) + logger.info('CSR:%f'%(csr)) + + ENV['train_history'].append(trainer.acc_meters.avg*100) + ENV['eval_history'].append(evaluator.acc_meters.avg*100) + ENV['curren_acc'] = evaluator.acc_meters.avg*100 + ENV['best_acc'] = max(ENV['curren_acc'], ENV['best_acc']) + ENV['lid'].append(lid) + ENV['csr'].append(csr) + + + # Reset Stats + trainer._reset_stats() + evaluator._reset_stats() + + # Save Model + target_model = model.module if args.data_parallel else model + util.save_model(ENV=ENV, + epoch=epoch, + model=target_model, + optimizer=optimizer, + scheduler=scheduler, + filename=checkpoint_path_file) + logger.info('Model Saved at %s', checkpoint_path_file) + torch.cuda.empty_cache() + return + + +def main(): + if config.dataset.name == 'DatasetGenerator': + data_loader = config.dataset(seed=args.seed, noise_rate=args.noise_rate, asym=args.asym) + else: + data_loader = config.dataset() + + model = config.model() + if isinstance(data_loader, dataset.Clothing1MDatasetLoader): + model.fc = torch.nn.Linear(2048, 14) + model = model.to(device) + + data_loader = data_loader.getDataLoader() + logger.info("param size = %fMB", util.count_parameters_in_MB(model)) + if args.data_parallel: + model = torch.nn.DataParallel(model) + #data_train = data_loader['train_dataset'].dataset + #tensor_list = [] + #for j in range(len(data_train)): + #tensor_list.append(data_train[j][0]) + + optimizer = config.optimizer(model.parameters()) + scheduler = config.scheduler(optimizer) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 40, gamma = 0.1) + if config.criterion.name == 'NLNL': + criterion = config.criterion(train_loader=data_loader['train_dataset']) + else: + criterion = config.criterion() + trainer = Trainer(data_loader['train_dataset'], logger, config) + evaluator = Evaluator(data_loader['test_dataset'], logger, config) + + starting_epoch = 0 + ENV = {'global_step': 0, + 'best_acc': 0.0, + 'current_acc': 0.0, + 'train_history': [], + 'eval_history': [], + 'lid':[], + 'csr':[]} + + if args.load_model: + checkpoint = 
util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + starting_epoch = checkpoint['epoch'] + ENV = checkpoint['ENV'] + trainer.global_step = ENV['global_step'] + logger.info("File %s loaded!" % (checkpoint_path_file)) + + idx = -5 if args.asym else -4 + if args.plot: + lid_trend_through_training(exp_name=args.exp_name, dataset=args.config_path[8:idx], data_loader=data_loader, device=device, model=model, optimizer=optimizer, scheduler=scheduler, model_name=args.version, noise_type='sym', noise_ratio=args.noise_rate) + elif args.plotall: + lid_trend_of_learning_models(exp_name=args.exp_name, dataset=args.config_path[8:idx], model=model, optimizer=optimizer, scheduler=scheduler, model_list=['ce', 'fl', 'bl', 'bsl', 'bhl', 'd2l'], noise_ratio=args.noise_rate) + test_acc_trend_of_learning_models(exp_name=args.exp_name, dataset=args.config_path[8:idx], model=model, optimizer=optimizer, scheduler=scheduler, model_list=['ce', 'fl', 'bl', 'bsl', 'bhl', 'd2l'], noise_ratio=args.noise_rate) + csr_trend_of_learning_models(exp_name=args.exp_name, dataset=args.config_path[8:idx], model=model, optimizer=optimizer, scheduler=scheduler, model_list=['ce', 'fl', 'bl', 'bsl', 'bhl', 'd2l'], noise_ratio=args.noise_rate) + else: + d2l_callback = D2LCallback(model, data_loader, device) + train(starting_epoch, model, data_loader, optimizer, scheduler, criterion, trainer, evaluator, ENV, d2l_callback, mode='stage1') + if args.version == 'd2l': + checkpoint = util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + starting_epoch = checkpoint['epoch'] + ENV = checkpoint['ENV'] + trainer.global_step = ENV['global_step'] + logger.info("File %s loaded!" % (checkpoint_path_file)) + + train(starting_epoch, model, data_loader, optimizer, scheduler, criterion, trainer, evaluator, ENV, d2l_callback, mode='stage2') + + + return + + +if __name__ == '__main__': + main() diff --git a/models.py b/models.py index ec6a834..be3e50e 100644 --- a/models.py +++ b/models.py @@ -1,127 +1,268 @@ -import numpy as np -import keras.backend as K -from keras.models import Model -from keras.regularizers import l2 -from keras.layers import Input, Conv2D, Dense, MaxPooling2D, Flatten, Activation, BatchNormalization -from resnet import cifar100_resnet - -def get_model(dataset='mnist', input_tensor=None, input_shape=None, num_classes=10): - """ - Takes in a parameter indicating which model type to use ('mnist', - 'cifar-10' or 'cifar-100') and returns the appropriate Keras model. - :param dataset: A string indicating which dataset we are building - a model for. - input_tensor: optional Keras tensor (i.e. output of `layers.Input()`) - to use as image input for the model. - input_shape: optional shape tuple - :return: The model; a Keras 'Model' instance. 
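As a sketch of how the main.py added above consumes a config through mlconfig: the path below simply follows the `--config_path`/`--version` convention used when assembling `config_file`, and the calls mirror those in `main()`; the exact YAML contents of the repo's configs are not reproduced here.

```python
import mlconfig
import models  # importing registers the model, optimizer and scheduler classes
import loss    # importing registers the loss classes

# hypothetical path assembled the same way as in main.py: config_path + version + '.yaml'
config = mlconfig.load('configs/cifar10/sym/nce+rce.yaml')

model = config.model()                            # e.g. a registered ToyModel / ResNet
optimizer = config.optimizer(model.parameters())  # a registered torch.optim class
scheduler = config.scheduler(optimizer)           # a registered lr_scheduler
criterion = config.criterion()                    # a registered loss from loss.py
print(config.epochs)                              # plain values are exposed as attributes
```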
- """ - assert dataset in ['mnist', 'svhn', 'cifar-10', 'cifar-100'], \ - "dataset parameter must be either 'mnist', 'svhn', 'cifar-10' or 'cifar-100'" - - if input_tensor is None: - img_input = Input(shape=input_shape) - else: - if not K.is_keras_tensor(input_shape): - img_input = Input(tensor=input_tensor, shape=input_shape) - else: - img_input = input_tensor - - if dataset == 'mnist': - # ## LeNet-5 like 4-layer CNN - x = Conv2D(32, (3, 3), padding='same', kernel_initializer="he_normal", name='conv1')(img_input) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x) - - x = Conv2D(64, (3, 3), padding='same', kernel_initializer="he_normal", name='conv2')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x) - - x = Flatten()(x) - - x = Dense(128, kernel_initializer="he_normal", name='fc1')(x) - x = BatchNormalization()(x) - x = Activation('relu', name='lid')(x) - # x = Dropout(0.2)(x) - - x = Dense(num_classes, kernel_initializer="he_normal")(x) - x = Activation('softmax')(x) - - model = Model(img_input, x) - - elif dataset == 'svhn': - # ## LeNet-5 like 5-layer CNN - x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', name='conv1')(img_input) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x) - - x = Conv2D(64, (3, 3), padding='same', kernel_initializer='he_normal', name='conv2')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x) - - x = Flatten()(x) - - x = Dense(512, kernel_initializer='he_normal', name='fc1')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - - x = Dense(128, kernel_initializer="he_normal", name='fc2')(x) - x = BatchNormalization()(x) - x = Activation('relu', name='lid')(x) - # x = Dropout(0.2)(x) - - x = Dense(num_classes, kernel_initializer="he_normal")(x) - x = Activation('softmax')(x) - - model = Model(img_input, x) - - elif dataset == 'cifar-10': - # VGG-like 8-layer CNN - # Block 1 - x = Conv2D(64, (3, 3), padding='same', kernel_initializer="he_normal", name='block1_conv1')(img_input) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = Conv2D(64, (3, 3), padding='same', kernel_initializer="he_normal", name='block1_conv2')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x) - - # Block 2 - x = Conv2D(128, (3, 3), padding='same', kernel_initializer="he_normal", name='block2_conv1')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = Conv2D(128, (3, 3), padding='same', kernel_initializer="he_normal", name='block2_conv2')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x) - - # Block 3 - x = Conv2D(196, (3, 3), padding='same', kernel_initializer="he_normal", name='block3_conv1')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = Conv2D(196, (3, 3), padding='same', kernel_initializer="he_normal", name='block3_conv2')(x) - x = BatchNormalization()(x) - x = Activation('relu')(x) - x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x) - - x = Flatten(name='flatten')(x) - - x = Dense(256, kernel_initializer="he_normal", kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01), name='fc1')(x) - x = BatchNormalization()(x) - x = Activation('relu', name='lid')(x) - - x = 
Dense(num_classes, kernel_initializer="he_normal")(x) - x = Activation('softmax')(x) - - # Create model. - model = Model(img_input, x) - - elif dataset == 'cifar-100': - # resnet - model = cifar100_resnet(depth=7, num_classes=num_classes) - - return model +import torch +import torch.nn as nn +import torch.nn.functional as F +import mlconfig +import torchvision +mlconfig.register(torchvision.models.resnet50) +mlconfig.register(torch.optim.SGD) +mlconfig.register(torch.optim.Adam) +mlconfig.register(torch.optim.lr_scheduler.MultiStepLR) +mlconfig.register(torch.optim.lr_scheduler.CosineAnnealingLR) +mlconfig.register(torch.optim.lr_scheduler.StepLR) +mlconfig.register(torch.optim.lr_scheduler.ExponentialLR) + + +class ConvBrunch(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size=3): + super(ConvBrunch, self).__init__() + padding = (kernel_size - 1) // 2 + self.out_conv = nn.Sequential( + nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, padding=padding), + nn.BatchNorm2d(out_planes), + nn.ReLU()) + + def forward(self, x): + return self.out_conv(x) + + +@mlconfig.register +class ToyModel(nn.Module): + def __init__(self, type='CIFAR10'): + super(ToyModel, self).__init__() + self.type = type + """ + if type == 'CIFAR10': + self.block1 = nn.Sequential( + ConvBrunch(3, 64, 3), + ConvBrunch(64, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block2 = nn.Sequential( + ConvBrunch(64, 128, 3), + ConvBrunch(128, 128, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block3 = nn.Sequential( + ConvBrunch(128, 196, 3), + ConvBrunch(196, 196, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Sequential( + nn.Linear(4*4*196, 256), + nn.BatchNorm1d(256), + nn.ReLU()) + self.fc2 = nn.Linear(256, 10) + self.fc_size = 4*4*196 + """ + if type == 'CIFAR10': + self.block1 = nn.Sequential( + ConvBrunch(3, 32, 3), + ConvBrunch(32, 32, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block2 = nn.Sequential( + ConvBrunch(32, 64, 3), + ConvBrunch(64, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block3 = nn.Sequential( + ConvBrunch(64, 128, 3), + ConvBrunch(128, 128, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Sequential( + nn.Flatten(), + nn.Dropout(0.5), + nn.Linear(4*4*128, 1024), + nn.ReLU(), + nn.BatchNorm1d(1024), + nn.Dropout(0.5), + nn.Linear(1024,512), + nn.ReLU(), + nn.BatchNorm1d(512) + ) + self.fc2 = nn.Sequential( + nn.Dropout(0.5), + nn.Linear(512,10) + ) + self.fc_size = 4*4*128 + + + elif type == 'MNIST': + self.block1 = nn.Sequential( + ConvBrunch(1, 64, 3), + ) + self.block2 = nn.Sequential( + ConvBrunch(64, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Dropout(0.5)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Sequential( + nn.Flatten(), + nn.Linear(64*14*14, 128), + nn.BatchNorm1d(128), + nn.ReLU(), + nn.Dropout(0.5)) + self.fc2 = nn.Linear(128, 10) + self.fc_size = 64*14*14 + + """ + elif type == 'MNIST': + self.block1 = nn.Sequential( + ConvBrunch(1, 32, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + self.block2 = nn.Sequential( + ConvBrunch(32, 64, 3), + nn.MaxPool2d(kernel_size=2, stride=2)) + # self.global_avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc1 = nn.Sequential( + nn.Linear(64*7*7, 128), + nn.BatchNorm1d(128), + nn.ReLU()) + self.fc2 = nn.Linear(128, 10) + self.fc_size = 64*7*7 + """ + self._reset_prams() + + def _reset_prams(self): + for m in self.modules(): + if isinstance(m, 
nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu') + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + return + + def forward(self, x): + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) if self.type == 'CIFAR10' else x + # x = self.global_avg_pool(x) + # x = x.view(x.shape[0], -1) + x = x.view(-1, self.fc_size) + x_fc1 = self.fc1(x) + x = self.fc2(x_fc1) + return x, x_fc1 + + +'''ResNet in PyTorch. +For Pre-activation ResNet, see 'preact_resnet.py'. +Reference: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + self._reset_prams() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = 
out.view(out.size(0), -1) + out = self.linear(out) + return out + + def _reset_prams(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_uniform_(m.weight, mode='fan_in', nonlinearity='relu') + elif isinstance(m, nn.Linear): + nn.init.xavier_uniform_(m.weight) + return + + +@mlconfig.register +def ResNet18(num_classes=10): + return ResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes) + + +@mlconfig.register +def ResNet34(num_classes=10): + return ResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes) + + +@mlconfig.register +def ResNet50(num_classes=10): + return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes) + + +@mlconfig.register +def ResNet101(num_classes=10): + return ResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes) + + +@mlconfig.register +def ResNet152(num_classes=10): + return ResNet(Bottleneck, [3, 8, 36, 3], num_classes=num_classes) diff --git a/plot.py b/plot.py new file mode 100644 index 0000000..eafd327 --- /dev/null +++ b/plot.py @@ -0,0 +1,214 @@ +import os +import numpy as np +import matplotlib.pyplot as plt +from lid import lid_mle +import torch +import util + +np.random.seed(1024) + +MODELS = ['ce', 'fl', 'bl', 'bsl', 'bhl', 'd2l'] +MODEL_LABELS = ['cross-entropy', 'forward', 'backward', 'boot-soft', 'boot-hard', 'D2L'] +COLORS = ['r', 'y', 'c', 'm', 'g', 'b'] +MARKERS = ['x', 'D', '<', '>', '^', 'o'] + + +def lid_trend_through_training(exp_name, dataset, data_loader, device, model, optimizer, scheduler, model_name='d2l', noise_type='sym', noise_ratio=0.): + """ + plot the lid trend for clean vs noisy samples through training. + This can provide some information about manifold learning dynamics through training. + """ + + lids, train_accs, test_accs = None, None, None + + # get LID of raw inputs + k = 20 + lids = [] + for j, (images,labels) in enumerate(data_loader['train_dataset']): + images = images.to(device, non_blocking = True) + lids.extend(lid_mle(images, images, k=k)) + + lids = torch.stack(lids, dim=0).type(torch.float32) + lid_X = lids.mean() + print('LID of input X: ', lid_X) + + exp_path = os.path.join(exp_name, model_name) + checkpoint_path = os.path.join(exp_path, 'checkpoints') + checkpoint_path_file = os.path.join(checkpoint_path, model_name) + checkpoint = util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + ENV = checkpoint['ENV'] + train_accs = ENV['train_history'] + train_accs.insert(0,0) + test_accs = ENV['eval_history'] + test_accs.insert(0,0) + lids = ENV['lid'] + lids.insert(0,lid_X) + lids = torch.stack(lids, dim=0).type(torch.float32) + + plot(dataset, model_name, noise_ratio, lids, train_accs, test_accs) + + +def plot(dataset, model_name, noise_ratio, lids, train_accs, test_accs): + """ + plot function + """ + # plot + fig = plt.figure() # figsize=(7, 6) + xnew = np.arange(0, len(lids), 5) + + lids = lids.cpu().numpy() + train_accs = np.array(train_accs) / 100 + test_accs = np.array(test_accs) / 100 + print(train_accs) + lids = lids[xnew] + train_accs = train_accs[xnew] + test_accs = test_accs[xnew] + + ax = fig.add_subplot(111) + ax.plot(xnew, lids, c='r', marker='o', markersize=3, linewidth=2, label='LID score') + + ax2 = ax.twinx() + ax2.plot(xnew, train_accs, c='b', marker='x', markersize=3, linewidth=2, label='Train acc') + ax2.plot(xnew, test_accs, c='c', marker='^', markersize=3, linewidth=2, label='Test acc') + + # ax.set_xticks([]) + # ax.set_yticks([]) + ax.set_xlabel("Epoch", fontsize=15) + ax.set_ylabel("Subspace 
dimensionality (LID score)", fontsize=15) + ax2.set_ylabel("Train/test accuracy", fontsize=15) + # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) + + if dataset == 'mnist': + ax.set_ylim((4, 22)) # for mnist + ax2.set_ylim((0.2, 1.2)) + elif dataset == 'svhn': + ax.set_ylim((7, 20)) # for svhn + ax2.set_ylim((0.2, 1.2)) + elif dataset == 'cifar10': + ax.set_ylim((2.5, 12.5)) # for cifar-10 + #ax.set_ylim((3.5, 20.5)) + ax2.set_ylim((0., 1.2)) + elif dataset == 'cifar100': + ax.set_ylim((3, 12)) # for cifar-100 + ax2.set_ylim((0., 1.)) + + legend = ax.legend(loc='upper left') + plt.setp(legend.get_texts(), fontsize=15) + legend2 = ax2.legend(loc='upper right') + plt.setp(legend2.get_texts(), fontsize=15) + fig.savefig("plots/lid_trend_%s_%s_%s.png" % (model_name, dataset, noise_ratio), dpi=300) + plt.show() + + +def lid_trend_of_learning_models(exp_name, dataset, model, optimizer, scheduler, model_list=['ce'], noise_ratio=0): + """ + The LID trend of different learning models throughout. + """ + # plot initialization + fig = plt.figure() # figsize=(7, 6) + ax = fig.add_subplot(111) + + for model_name in model_list: + exp_path = os.path.join(exp_name, model_name) + checkpoint_path = os.path.join(exp_path, 'checkpoints') + checkpoint_path_file = os.path.join(checkpoint_path, model_name) + checkpoint = util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + ENV = checkpoint['ENV'] + lids = ENV['lid'] + lids = torch.stack(lids, dim=0).type(torch.float32) + lids = lids.cpu().numpy() + # smooth for plot + lids[lids < 0] = 0 + lids[lids > 10] = 10 + + xnew = np.arange(0, len(lids), 5) + lids = lids[xnew] + + # plot line + idx = MODELS.index(model_name) + ax.plot(xnew, lids, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) + + ax.set_xlabel("Epoch", fontsize=15) + ax.set_ylabel("Subspace dimensionality (LID score)", fontsize=15) + # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) + legend = plt.legend(loc='lower center', ncol=2) + plt.setp(legend.get_texts(), fontsize=15) + fig.savefig("plots/lid_trend_all_models_%s_%s_%s.png" % (exp_name, dataset, noise_ratio), dpi=300) + plt.show() + +def test_acc_trend_of_learning_models(exp_name, dataset, model, optimizer, scheduler, model_list=['ce'], noise_ratio=0): + """ + The test_acc trend of different learning models throughout. 
+ """ + # plot initialization + fig = plt.figure() # figsize=(7, 6) + ax = fig.add_subplot(111) + + for model_name in model_list: + exp_path = os.path.join(exp_name, model_name) + checkpoint_path = os.path.join(exp_path, 'checkpoints') + checkpoint_path_file = os.path.join(checkpoint_path, model_name) + checkpoint = util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + ENV = checkpoint['ENV'] + test_accs = ENV['eval_history'] + test_accs = np.array(test_accs) / 100 + + xnew = np.arange(0, len(test_accs), 5) + test_accs = test_accs[xnew] + + # plot line + idx = MODELS.index(model_name) + ax.plot(xnew, test_accs, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) + + ax.set_xlabel("Epoch", fontsize=15) + ax.set_ylabel("Test Accuracy", fontsize=15) + # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) + legend = plt.legend(loc='lower center', ncol=2) + plt.setp(legend.get_texts(), fontsize=15) + fig.savefig("plots/test_accs_trend_all_models_%s_%s_%s.png" % (exp_name, dataset, noise_ratio), dpi=300) + plt.show() + +def csr_trend_of_learning_models(exp_name, dataset, model, optimizer, scheduler, model_list=['ce'], noise_ratio=0): + """ + The CSR trend of different learning models throughout. + """ + # plot initialization + fig = plt.figure() # figsize=(7, 6) + ax = fig.add_subplot(111) + + for model_name in model_list: + exp_path = os.path.join(exp_name, model_name) + checkpoint_path = os.path.join(exp_path, 'checkpoints') + checkpoint_path_file = os.path.join(checkpoint_path, model_name) + checkpoint = util.load_model(filename=checkpoint_path_file, + model=model, + optimizer=optimizer, + scheduler=scheduler) + ENV = checkpoint['ENV'] + csr = ENV['csr'] + csr = torch.stack(csr, dim=0).type(torch.float32) + csr = csr.cpu().numpy() + + xnew = np.arange(0, len(csr), 5) + csr = csr[xnew] + + # plot line + idx = MODELS.index(model_name) + ax.plot(xnew, csr, c=COLORS[idx], marker=MARKERS[idx], markersize=3, linewidth=2, label=MODEL_LABELS[idx]) + + ax.set_xlabel("Epoch", fontsize=15) + ax.set_ylabel("CRS", fontsize=15) + # ax.set_title("%s with %s%% noisy labels" % (dataset.upper(), noise_ratio), fontsize=15) + legend = plt.legend(loc='lower center', ncol=2) + plt.setp(legend.get_texts(), fontsize=15) + fig.savefig("plots/crs_trend_all_models_%s_%s_%s.png" % (exp_name, dataset, noise_ratio), dpi=300) + plt.show() \ No newline at end of file diff --git a/representation_plot.py b/representation_plot.py deleted file mode 100644 index cf2b242..0000000 --- a/representation_plot.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Date: 28/07/2017 -feature exploration and visualization - -Author: Xingjun Ma -""" -import os -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.gridspec as gridspec -from sklearn.manifold import TSNE -from keras.optimizers import SGD -from util import get_deep_representations -from datasets import get_data -from models import get_model -from loss import cross_entropy - -np.random.seed(1234) - -CLASSES = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'] - -def feature_visualization(model_name='ce', dataset='mnist', - num_classes=10, noise_ratio=40, n_samples=100): - """ - This is to show how features of incorretly labeled images are overffited to the wrong class. - plot t-SNE 2D-projected deep features (right before logits). - This will generate 3 plots in a grid (3x1). 
- The first shows the raw features projections of two classes of images (clean label + noisy label) - The second shows the deep features learned by cross-entropy after training. - The third shows the deep features learned using a new loss after training. - - :param model_name: a new model other than crossentropy(ce), can be: boot_hard, boot_soft, forward, backward, lid - :param dataset: - :param num_classes: - :param noise_type; - :param noise_ratio: - :param epochs: to find the last epoch - :param n_samples: - :return: - """ - print('Dataset: %s, model_name: ce/%s, noise ratio: %s%%' % (model_name, dataset, noise_ratio)) - features_ce = np.array([None, None]) - features_other = np.array([None, None]) - - # # load pre-saved to avoid recomputing - # feature_tmp = "lof/representation_%s_%s.npy" % (dataset, noise_ratio) - # if os.path.isfile(feature_tmp): - # data = np.load(feature_tmp) - # features_input = data[0] - # features_ce = data[1] - # features_other = data[2] - # - # plot(model_name, dataset, noise_ratio, features_input, features_ce, features_other) - # return - - # load data - X_train, Y_train, X_test, Y_test = get_data(dataset) - Y_noisy = np.load("data/noisy_label_%s_%s.npy" % (dataset, noise_ratio)) - Y_noisy = Y_noisy.reshape(-1) - - # sample training set - cls_a = 0 - cls_b = 3 - - # find smaples labeled to class A and B - cls_a_idx = np.where(Y_noisy == cls_a)[0] - cls_b_idx = np.where(Y_noisy == cls_b)[0] - - # sampling for efficiency purpose - cls_a_idx = np.random.choice(cls_a_idx, n_samples, replace=False) - cls_b_idx = np.random.choice(cls_b_idx, n_samples, replace=False) - - X_a = X_train[cls_a_idx] - X_b = X_train[cls_b_idx] - - image_shape = X_train.shape[1:] - model = get_model(dataset, input_tensor=None, input_shape=image_shape) - sgd = SGD(lr=0.01, momentum=0.9) - - - #### get deep representations of ce model - model_path = 'model/ce_%s_%s.hdf5' % (dataset, noise_ratio) - model.load_weights(model_path) - model.compile( - loss=cross_entropy, - optimizer=sgd, - metrics=['accuracy'] - ) - - rep_a = get_deep_representations(model, X_a, batch_size=100).reshape((X_a.shape[0], -1)) - rep_b = get_deep_representations(model, X_b, batch_size=100).reshape((X_b.shape[0], -1)) - - rep_a = TSNE(n_components=2).fit_transform(rep_a) - rep_b = TSNE(n_components=2).fit_transform(rep_b) - features_ce[0] = rep_a - features_ce[1] = rep_b - - #### get deep representations of other model - model_path = 'model/%s_%s_%s.hdf5' % (model_name, dataset, noise_ratio) - model.load_weights(model_path) - model.compile( - loss=cross_entropy, - optimizer=sgd, - metrics=['accuracy'] - ) - - rep_a = get_deep_representations(model, X_a, batch_size=100).reshape((X_a.shape[0], -1)) - rep_b = get_deep_representations(model, X_b, batch_size=100).reshape((X_b.shape[0], -1)) - - rep_a = TSNE(n_components=2).fit_transform(rep_a) - rep_b = TSNE(n_components=2).fit_transform(rep_b) - features_other[0] = rep_a - features_other[1] = rep_b - - # plot - fig = plt.figure(figsize=(12, 5)) - gs = gridspec.GridSpec(1, 2, wspace=0.15) - - a_clean_idx = Y_train[cls_a_idx] == Y_noisy[cls_a_idx] - a_noisy_idx = Y_train[cls_a_idx] != Y_noisy[cls_a_idx] - b_clean_idx = Y_train[cls_b_idx] == Y_noisy[cls_b_idx] - b_noisy_idx = Y_train[cls_b_idx] != Y_noisy[cls_b_idx] - - ## plot features learned by cross-entropy - ax = fig.add_subplot(gs[0, 0]) - A = features_ce[0] - B = features_ce[1] - # clean labeld class A samples plot - ax.scatter(A[a_clean_idx][:, 0].ravel(), A[a_clean_idx][:, 1].ravel(), c='b', marker='o', s=10, 
label='class A: clean') - ax.scatter(A[a_noisy_idx][:, 0].ravel(), A[a_noisy_idx][:, 1].ravel(), c='m', marker='x', s=30, label='class A: noisy') - ax.scatter(B[b_clean_idx][:, 0].ravel(), B[b_clean_idx][:, 1].ravel(), c='r', marker='o', s=10, label='class B: clean') - ax.scatter(B[b_noisy_idx][:, 0].ravel(), B[b_noisy_idx][:, 1].ravel(), c='c', marker='x', s=30, label='class B: noisy') - - ax.set_title("cross-entropy", fontsize=15) - legend = ax.legend(loc='lower center', ncol=2) - plt.setp(legend.get_texts(), fontsize=15) - - ax = fig.add_subplot(gs[0, 1]) - A = features_other[0] - B = features_other[1] - ax.scatter(A[a_clean_idx][:, 0].ravel(), A[a_clean_idx][:, 1].ravel(), c='b', marker='o', s=10, label='class A: clean') - ax.scatter(A[a_noisy_idx][:, 0].ravel(), A[a_noisy_idx][:, 1].ravel(), c='m', marker='x', s=30, label='class A: noisy') - ax.scatter(B[b_clean_idx][:, 0].ravel(), B[b_clean_idx][:, 1].ravel()-5, c='r', marker='o', s=10, label='class B: clean') - ax.scatter(B[b_noisy_idx][:, 0].ravel(), B[b_noisy_idx][:, 1].ravel(), c='c', marker='x', s=30, label='class B: noisy') - - ax.set_title("D2L", fontsize=15) - legend = ax.legend(loc='lower center', ncol=2) - plt.setp(legend.get_texts(), fontsize=15) - - fig.savefig("plots/representations_%s_%s_%s.png" % (model_name, dataset, noise_ratio), dpi=300) - plt.show() - -if __name__ == "__main__": - feature_visualization(model_name='d2l', dataset='cifar-10', num_classes=10, noise_ratio=60, n_samples=500) \ No newline at end of file diff --git a/resnet.py b/resnet.py deleted file mode 100644 index 665e09b..0000000 --- a/resnet.py +++ /dev/null @@ -1,122 +0,0 @@ -"""Some code sections are taken from -https://github.com/raghakot/keras-resnet -""" - -import sys - -import numpy as np - -from keras.models import Model -from keras.layers import Input, Activation, merge, Dense, Flatten -from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D -from keras.layers.convolutional import AveragePooling2D -from keras.layers.normalization import BatchNormalization -from keras.regularizers import l2 -from keras.layers.merge import add -from keras import backend as K - -sys.setrecursionlimit(10000) - -BN_AXIS = 3 - - -def cifar100_resnet(depth, num_classes): - # how many layers this is going to create? 
- # 2 + 6 * depth - - img_channels = 3 - img_rows = 32 - img_cols = 32 - num_conv = 3 - decay = 2e-3 - - input = Input(shape=(img_rows, img_cols, img_channels)) - - # 1 conv + BN + relu - filters = 16 - b = Conv2D(filters=filters, kernel_size=(num_conv, num_conv), - kernel_initializer="he_normal", padding="same", - kernel_regularizer=l2(decay), bias_regularizer=l2(0))(input) - b = BatchNormalization(axis=BN_AXIS)(b) - b = Activation("relu")(b) - - # 1 res, no striding - b = residual(num_conv, filters, decay, first=True)(b) # 2 layers inside - for _ in np.arange(1, depth): # start from 1 => 2 * depth in total - b = residual(num_conv, filters, decay)(b) - - filters *= 2 - - # 2 res, with striding - b = residual(num_conv, filters, decay, more_filters=True)(b) - for _ in np.arange(1, depth): - b = residual(num_conv, filters, decay)(b) - - filters *= 2 - - # 3 res, with striding - b = residual(num_conv, filters, decay, more_filters=True)(b) - for _ in np.arange(1, depth): - b = residual(num_conv, filters, decay)(b) - - b = BatchNormalization(axis=BN_AXIS)(b) - b = Activation("relu")(b) - - b = AveragePooling2D(pool_size=(8, 8), strides=(1, 1), - padding="valid")(b) - - out = Flatten(name='lid')(b) - - dense = Dense(units=num_classes, kernel_initializer="he_normal", - kernel_regularizer=l2(decay), bias_regularizer=l2(0))(out) - - act = Activation("softmax")(dense) - - return Model(inputs=input, outputs=act) - - -def residual(num_conv, filters, decay, more_filters=False, first=False): - def f(input): - # in_channel = input._keras_shape[1] - out_channel = filters - - if more_filters and not first: - # out_channel = in_channel * 2 - stride = 2 - else: - # out_channel = in_channel - stride = 1 - - if not first: - b = BatchNormalization(axis=BN_AXIS)(input) - b = Activation("relu")(b) - else: - b = input - - b = Conv2D(filters=out_channel, - kernel_size=(num_conv, num_conv), - strides=(stride, stride), - kernel_initializer="he_normal", padding="same", - kernel_regularizer=l2(decay), bias_regularizer=l2(0))(b) - b = BatchNormalization(axis=BN_AXIS)(b) - b = Activation("relu")(b) - res = Conv2D(filters=out_channel, - kernel_size=(num_conv, num_conv), - kernel_initializer="he_normal", padding="same", - kernel_regularizer=l2(decay), bias_regularizer=l2(0))(b) - - # check and match number of filter for the shortcut - input_shape = K.int_shape(input) - residual_shape = K.int_shape(res) - if not input_shape[3] == residual_shape[3]: - stride_width = int(round(input_shape[1] / residual_shape[1])) - stride_height = int(round(input_shape[2] / residual_shape[2])) - - input = Conv2D(filters=residual_shape[3], kernel_size=(1, 1), - strides=(stride_width, stride_height), - kernel_initializer="he_normal", - padding="valid", kernel_regularizer=l2(decay))(input) - - return add([input, res]) - - return f \ No newline at end of file diff --git a/script/CIFAR10.slurm b/script/CIFAR10.slurm new file mode 100644 index 0000000..8fc12ca --- /dev/null +++ b/script/CIFAR10.slurm @@ -0,0 +1,61 @@ +#!/bin/bash +#SBATCH --nodes 1 +#SBATCH --partition gpgpu +#SBATCH --gres=gpu:1 + +# The project ID which this job should run under: +#SBATCH --account="punim0784" + +# Maximum number of tasks/CPU cores used by the job: +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 + +# The amount of memory in megabytes per process in the job: +#SBATCH --mem=64G + +# The maximum running time of the job in days-hours:mins:sec +#SBATCH --time 96:00:00 + +# check that the script is launched with sbatch +if [ "x$SLURM_JOB_ID" == "x" ]; then + echo "You 
need to submit your job to the queuing system with sbatch" + exit 1 +fi + + +# Run the job from this directory: +cd /data/cephfs/punim0784/robust_loss_nips + +# The modules to load: +module load Python/3.6.4-intel-2017.u2-GCC-6.2.0-CUDA10 +nvidia-smi + +exp_name=$1 +seed=$2 +loss=$3 + +# Sym +declare -a nr_arr=("0.0" + "0.2" + "0.4" + "0.6" + "0.8") + +for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/cifar10/sym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/cifar10/sym/$i --seed $seed --noise_rate $i --config_path configs/cifar10/sym --version ${loss} +done + +# Asym +declare -a nr_arr=( + "0.1" + "0.2" + "0.3" + "0.4" + ) +for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/cifar10/asym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/cifar10/asym/$i --seed $seed --noise_rate $i --config_path configs/cifar10/asym --version ${loss} +done diff --git a/script/CIFAR100.slurm b/script/CIFAR100.slurm new file mode 100644 index 0000000..8b36be6 --- /dev/null +++ b/script/CIFAR100.slurm @@ -0,0 +1,63 @@ +#!/bin/bash +#SBATCH --nodes 1 +#SBATCH --partition gpgpu +#SBATCH --gres=gpu:1 + +# The project ID which this job should run under: +#SBATCH --account="punim0784" + +# Maximum number of tasks/CPU cores used by the job: +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 + +# The amount of memory in megabytes per process in the job: +#SBATCH --mem=64G + +# The maximum running time of the job in days-hours:mins:sec +#SBATCH --time 168:00:00 + +# check that the script is launched with sbatch +if [ "x$SLURM_JOB_ID" == "x" ]; then + echo "You need to submit your job to the queuing system with sbatch" + exit 1 +fi + + +# Run the job from this directory: +cd /data/cephfs/punim0784/robust_loss_nips + + +# The modules to load: +module load Python/3.6.4-intel-2017.u2-GCC-6.2.0-CUDA10 +nvidia-smi + +exp_name=$1 +seed=$2 +loss=$3 + + +# Sym +declare -a nr_arr=("0.0" + "0.2" + "0.4" + "0.6" + "0.8") + +for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/cifar100/sym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/cifar100/sym/$i --seed $seed --noise_rate $i --config_path configs/cifar100/sym --version ${loss} +done + +# Asym +declare -a nr_arr=( + "0.1" + "0.2" + "0.3" + "0.4" + ) +for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/cifar100/asym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/cifar100/asym/$i --seed $seed --noise_rate $i --config_path configs/cifar100/asym --version ${loss} --asym +done diff --git a/script/MNIST.slurm b/script/MNIST.slurm new file mode 100644 index 0000000..f6d35c8 --- /dev/null +++ b/script/MNIST.slurm @@ -0,0 +1,63 @@ +#!/bin/bash +#SBATCH --nodes 1 +#SBATCH --partition gpgpu +#SBATCH --gres=gpu:1 + +# The project ID which this job should run under: +#SBATCH --account="punim0784" + +# Maximum number of tasks/CPU cores used by the job: +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 + +# The amount of memory in megabytes per process in the job: +#SBATCH --mem=64G + +# The maximum running time of the job in days-hours:mins:sec +#SBATCH --time 48:00:00 + +# check that the script is launched with sbatch +if [ "x$SLURM_JOB_ID" == "x" ]; then + echo "You need to submit your job to the queuing system with sbatch" + exit 1 +fi + + +# Run the job from this directory: +# cd /data/gpfs/users/hanxunh/robust_loss_nips +cd /data/cephfs/punim0784/robust_loss_nips + +# The modules to load: +module load Python/3.6.4-intel-2017.u2-GCC-6.2.0-CUDA10 +nvidia-smi + +exp_name=$1 +seed=$2 +loss=$3 + + +# Sym +declare -a nr_arr=("0.0" + "0.2" + "0.4" + "0.6" + "0.8") + 
+for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/mnist/sym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/mnist/sym/$i --seed $seed --noise_rate $i --config_path configs/mnist/sym --version ${loss} +done + +# Asym +declare -a nr_arr=( + "0.1" + "0.2" + "0.3" + "0.4" + ) +for i in "${nr_arr[@]}" + do + rm -rf ${exp_name}/mnist/asym/$i/${loss}/* + python3 -u main.py --exp_name ${exp_name}/mnist/asym/$i --seed $seed --noise_rate $i --config_path configs/mnist/asym --version ${loss} --asym +done diff --git a/script/WebVisionMini.slurm b/script/WebVisionMini.slurm new file mode 100644 index 0000000..3f06a08 --- /dev/null +++ b/script/WebVisionMini.slurm @@ -0,0 +1,60 @@ +#!/bin/bash +#SBATCH --nodes 1 +#SBATCH --partition gpgpu +#SBATCH --gres=gpu:4 + +# The project ID which this job should run under: +#SBATCH --account="punim0784" + +# Maximum number of tasks/CPU cores used by the job: +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=24 + +# The amount of memory in megabytes per process in the job: +#SBATCH --mem=120G + +# The maximum running time of the job in days-hours:mins:sec +#SBATCH --time 168:00:00 + +# check that the script is launched with sbatch +if [ "x$SLURM_JOB_ID" == "x" ]; then + echo "You need to submit your job to the queuing system with sbatch" + exit 1 +fi + + +# Copy Data to local node +cd /var/local/tmp/ +mkdir -p datasets +mkdir -p datasets/ILSVR2012 + +pwd +echo 'rsync datasets' + +rsync -avzh --progress /data/cephfs/punim0784/datasets/google_resized_256.tar datasets/ +rsync -avzh --progress /data/cephfs/punim0784/datasets/webvision_mini_train.txt datasets/ +rsync -avzh --progress /data/cephfs/punim0784/datasets/train_filelist_google.txt datasets/ + +rsync -avzh --progress /data/cephfs/punim0784/datasets/ILSVR2012/ILSVRC2012_img_val.tar datasets/ILSVR2012/ +rsync -avzh --progress /data/cephfs/punim0784/datasets/ILSVR2012/meta.bin datasets/ILSVR2012/ +rsync -avzh --progress /data/cephfs/punim0784/datasets/ILSVR2012/ILSVRC2012_devkit_t12.tar.gz datasets/ILSVR2012/ + +cd datasets +pwd +echo 'untar google_resized_256' +tar -xvf google_resized_256.tar + +# Run the job from this directory: +cd /data/cephfs/punim0784/robust_loss_nips + +# The modules to load: +module load Python/3.6.4-intel-2017.u2-GCC-6.2.0-CUDA10 +nvidia-smi + +exp_name=$1 +seed=$2 +loss=$3 + +rm -rf ${exp_name}/web_vision_mini/${loss}/* +rm -rf ${exp_name}/web_vision_mini/${loss} +python3 -u main.py --data_parallel --exp_name ${exp_name}/webvision_mini/ --seed $seed --config_path configs/webvision_mini --version ${loss} diff --git a/script/script.sh b/script/script.sh new file mode 100644 index 0000000..b4e1566 --- /dev/null +++ b/script/script.sh @@ -0,0 +1,91 @@ +# +# # CIFAR100 +# declare -a loss=( "ce" +# "focal" +# "gce" +# "mae" +# "nce" +# "nce+mae" +# "nce+rce" +# "nfl" +# "nfl+mae" +# "nfl+rce" +# "ngce" +# "ngce+mae" +# "ngce+rce" +# "nlnl" +# "rce" +# "sce" ) +# +# declare -a run_version=( +# "run1" +# "run2" +# "run3" +# ) +# +# seed=0 +# for i in "${run_version[@]}" +# do +# for j in "${loss[@]}" +# do +# job_name=C100_${i}_${j} +# echo $job_name +# sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=8 --gres=gpu:1 CIFAR100.slurm $i $seed $j +# done +# seed=$((seed+1)) +# done + + +# # WebVision Full +# declare -a loss=( +# "ce" +# "gce" +# "nce+mae" +# "nce+rce" +# "nfl+mae" +# "nfl+rce" +# "sce" +# ) +# +# declare -a run_version=( +# "webvision_full" +# ) +# +# seed=0 +# for i in "${run_version[@]}" +# do +# for j in "${loss[@]}" +# do +# 
job_name=WebVisionFull_${i}_${j} +# echo $job_name +# sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=12 --gres=gpu:4 WebVisionFull.slurm $i $seed $j +# done +# seed=$((seed+1)) +# done + +# # WebVision Mini +# declare -a loss=( +# "ce" +# "gce" +# "nce+mae" +# "nce+rce" +# "nfl+mae" +# "nfl+rce" +# "sce" +# ) +# +# declare -a run_version=( +# "webvision_mini" +# ) +# +# seed=0 +# for i in "${run_version[@]}" +# do +# for j in "${loss[@]}" +# do +# job_name=WebVisionMini${i}_${j} +# echo $job_name +# sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=24 --gres=gpu:4 WebVisionMini.slurm $i $seed $j +# done +# seed=$((seed+1)) +# done diff --git a/script/submit_c10.sh b/script/submit_c10.sh new file mode 100644 index 0000000..249647a --- /dev/null +++ b/script/submit_c10.sh @@ -0,0 +1,39 @@ +# CIFAR10 +declare -a loss=( + "ce" + "focal" + "gce" + "mae" + "nce" + "nce+mae" + "nce+rce" + "nfl" + "nfl+mae" + "nfl+rce" + "ngce" + "ngce+mae" + "ngce+rce" + # "nlnl" + "rce" + "sce" + ) + +declare -a run_version=( + "run1" + "run2" + "run3" + ) + +seed=0 +for i in "${run_version[@]}" +do + for j in "${loss[@]}" + do + echo C10_${i}_${j} + job_name=${j}_C10_${i} + # sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=8 --gres=gpu:1 CIFAR10.slurm $i $seed $j + sbatch --partition gpgpu --job-name $job_name --cpus-per-task=4 --gres=gpu:1 --mem=32G CIFAR10.slurm $i $seed $j + # sbatch --partition deeplearn --qos gpgpudeeplearn --job-name $job_name --cpus-per-task=4 --gres=gpu:1 --mem=32G CIFAR10.slurm $i $seed $j + done + seed=$((seed+1)) +done diff --git a/script/submit_c100.sh b/script/submit_c100.sh new file mode 100644 index 0000000..6288727 --- /dev/null +++ b/script/submit_c100.sh @@ -0,0 +1,39 @@ +# CIFAR10 +declare -a loss=( + "ce" + "focal" + "gce" + "mae" + "nce" + "nce+mae" + "nce+rce" + "nfl" + "nfl+mae" + "nfl+rce" + "ngce" + "ngce+mae" + "ngce+rce" + # "nlnl" + "rce" + "sce" + ) + +declare -a run_version=( + "run1" + "run2" + "run3" + ) + +seed=0 +for i in "${run_version[@]}" +do + for j in "${loss[@]}" + do + echo C100_${i}_${j} + job_name=${j}_C100_${i} + # sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=8 --gres=gpu:1 CIFAR10.slurm $i $seed $j + sbatch --partition gpgpu --job-name $job_name --cpus-per-task=4 --gres=gpu:1 --mem=32G CIFAR100.slurm $i $seed $j + # sbatch --partition deeplearn --qos gpgpudeeplearn --job-name $job_name --cpus-per-task=4 --gres=gpu:1 --mem=32G CIFAR10.slurm $i $seed $j + done + seed=$((seed+1)) +done diff --git a/script/submit_clothing1m.sh b/script/submit_clothing1m.sh new file mode 100644 index 0000000..c6b3f37 --- /dev/null +++ b/script/submit_clothing1m.sh @@ -0,0 +1,25 @@ +# Clothing1M +declare -a loss=( "ce" + "gce" + "nce+mae" + "nce+rce" + "nfl+mae" + "nfl+rce" + "sce" ) + +declare -a run_version=( + "clothing1m" + ) + +seed=0 +for i in "${run_version[@]}" +do + for j in "${loss[@]}" + do + job_name=Clothing1M_${i}_${j} + echo $job_name + sbatch --partition gpgpu --cpus-per-task=8 --gres=gpu:4 Clothing1M.slurm $i $seed $j + # sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --cpus-per-task=24 --gres=gpu:4 Clothing1M.slurm $i $seed $j + done + seed=$((seed+1)) +done diff --git a/script/submit_mnist.sh b/script/submit_mnist.sh new file mode 100644 index 0000000..8ec47d1 --- /dev/null +++ b/script/submit_mnist.sh @@ -0,0 +1,38 @@ +# MNIST +declare -a loss=( + "ce" + "focal" + "gce" + 
"mae" + "nce" + "nce+mae" + "nce+rce" + "nfl" + "nfl+mae" + "nfl+rce" + "ngce" + "ngce+mae" + "ngce+rce" + # "nlnl" + "rce" + "sce" + ) + +declare -a run_version=( + "run1" + "run2" + "run3" + ) + +seed=0 +for i in "${run_version[@]}" +do + for j in "${loss[@]}" + do + job_name=${j}_MNIST_${i} + echo $job_name + # sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=8 --gres=gpu:1 MNIST.slurm $i $seed $j + sbatch --partition gpgpu --job-name $job_name --cpus-per-task=4 --gres=gpu:1 --mem=16G MNIST.slurm $i $seed $j + done + seed=$((seed+1)) +done diff --git a/script/submit_webvision_mini.sh b/script/submit_webvision_mini.sh new file mode 100644 index 0000000..2c1e55d --- /dev/null +++ b/script/submit_webvision_mini.sh @@ -0,0 +1,28 @@ +# WebVision Mini +declare -a loss=( + "ce" + "gce" + # "nce+mae" + # "nce+rce" + # "nfl+mae" + # "nfl+rce" + "sce" + ) + +declare -a run_version=( + "webvision_mini" + ) + +seed=0 +for i in "${run_version[@]}" +do + for j in "${loss[@]}" + do + job_name=WebVisionMini${i}_${j} + echo $job_name + sbatch --partition gpgpu --job-name $job_name --cpus-per-task=8 --gres=gpu:4 --mem=96G WebVisionMini.slurm $i $seed $j + # sbatch --partition deeplearn --qos gpgpudeeplearn --job-name $job_name --cpus-per-task=8 --gres=gpu:4 --mem=96G WebVisionMini.slurm $i $seed $j + # sbatch --partition gpgputest --qos=gpgpuhpcadmingpgpu --job-name $job_name --cpus-per-task=24 --gres=gpu:4 WebVisionMini.slurm $i $seed $j + done + seed=$((seed+1)) +done diff --git a/train_models.py b/train_models.py deleted file mode 100644 index a7165e1..0000000 --- a/train_models.py +++ /dev/null @@ -1,199 +0,0 @@ -from __future__ import absolute_import -from __future__ import print_function - -import os -import keras.backend as K -import argparse - -from keras.preprocessing.image import ImageDataGenerator -from keras.optimizers import SGD -from keras.callbacks import ModelCheckpoint - -from util import get_lr_scheduler, uniform_noise_model_P -from datasets import get_data, validatation_split -from models import get_model -from loss import cross_entropy, boot_soft, boot_hard, forward, backward, lid_paced_loss -from callback_util import D2LCallback, LoggerCallback - -D2L = {'mnist': {'init_epoch': 5, 'epoch_win': 5}, 'svhn': {'init_epoch': 20, 'epoch_win': 5}, - 'cifar-10': {'init_epoch': 40, 'epoch_win': 5}, 'cifar-100': {'init_epoch': 60, 'epoch_win': 5}} - -# prepare folders -folders = ['data', 'model', 'log'] -for folder in folders: - path = os.path.join('./', folder) - if not os.path.exists(path): - os.makedirs(path) - -def train(dataset='mnist', model_name='d2l', batch_size=128, epochs=50, noise_ratio=0): - """ - Train one model with data augmentation: random padding+cropping and horizontal flip - :param dataset: - :param model_name: - :param batch_size: - :param epochs: - :param noise_ratio: - :return: - """ - print('Dataset: %s, model: %s, batch: %s, epochs: %s, noise ratio: %s%%' % - (dataset, model_name, batch_size, epochs, noise_ratio)) - - # load data - X_train, y_train, X_test, y_test = get_data(dataset, noise_ratio, random_shuffle=True) - # X_train, y_train, X_val, y_val = validatation_split(X_train, y_train, split=0.1) - n_images = X_train.shape[0] - image_shape = X_train.shape[1:] - num_classes = y_train.shape[1] - print("n_images", n_images, "num_classes", num_classes, "image_shape:", image_shape) - - # load model - model = get_model(dataset, input_tensor=None, input_shape=image_shape, num_classes=num_classes) - # model.summary() - - if dataset 
== 'cifar-100': - optimizer = SGD(lr=0.1, decay=5e-3, momentum=0.9) - else: - optimizer = SGD(lr=0.1, decay=1e-4, momentum=0.9) - - # for backward, forward loss - # suppose the model knows noise ratio - P = uniform_noise_model_P(num_classes, noise_ratio/100.) - # create loss - if model_name == 'forward': - P = uniform_noise_model_P(num_classes, noise_ratio / 100.) - loss = forward(P) - elif model_name == 'backward': - P = uniform_noise_model_P(num_classes, noise_ratio / 100.) - loss = backward(P) - elif model_name == 'boot_hard': - loss = boot_hard - elif model_name == 'boot_soft': - loss = boot_soft - elif model_name == 'd2l': - if dataset == 'cifar-100': - loss = lid_paced_loss(beta1=6.0, beta2=0.1) - else: - loss = lid_paced_loss(beta1=0.1, beta2=1.0) - else: - loss = cross_entropy - - # model - model.compile( - loss=loss, - optimizer=optimizer, - metrics=['accuracy'] - ) - - ## do real-time updates using callbakcs - callbacks = [] - if model_name == 'd2l': - init_epoch = D2L[dataset]['init_epoch'] - epoch_win = D2L[dataset]['epoch_win'] - d2l_learning = D2LCallback(model, X_train, y_train, - dataset, noise_ratio, - epochs=epochs, - pace_type=model_name, - init_epoch=init_epoch, - epoch_win=epoch_win) - - callbacks.append(d2l_learning) - - cp_callback = ModelCheckpoint("model/%s_%s_%s.hdf5" % (model_name, dataset, noise_ratio), - monitor='val_loss', - verbose=0, - save_best_only=False, - save_weights_only=True, - period=1) - callbacks.append(cp_callback) - - else: - cp_callback = ModelCheckpoint("model/%s_%s_%s.hdf5" % (model_name, dataset, noise_ratio), - monitor='val_loss', - verbose=0, - save_best_only=False, - save_weights_only=True, - period=epochs) - callbacks.append(cp_callback) - - # learning rate scheduler if use sgd - lr_scheduler = get_lr_scheduler(dataset) - callbacks.append(lr_scheduler) - - # acc, loss, lid - log_callback = LoggerCallback(model, X_train, y_train, X_test, y_test, dataset, - model_name, noise_ratio, epochs) - callbacks.append(log_callback) - - # data augmentation - if dataset in ['mnist', 'svhn']: - datagen = ImageDataGenerator() - elif dataset in ['cifar-10']: - datagen = ImageDataGenerator( - width_shift_range=0.2, - height_shift_range=0.2, - horizontal_flip=True) - else: - datagen = ImageDataGenerator( - rotation_range=20, - width_shift_range=0.2, - height_shift_range=0.2, - horizontal_flip=True) - datagen.fit(X_train) - - # train model - model.fit_generator(datagen.flow(X_train, y_train, batch_size=batch_size), - steps_per_epoch=len(X_train) / batch_size, epochs=epochs, - validation_data=(X_test, y_test), - verbose=1, - callbacks=callbacks - ) - -def main(args): - assert args.dataset in ['mnist', 'svhn', 'cifar-10', 'cifar-100'], \ - "dataset parameter must be either 'mnist', 'svhn', 'cifar-10', 'cifar-100'" - assert args.model_name in ['ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'd2l'], \ - "dataset parameter must be either 'ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'd2l'" - train(args.dataset, args.model_name, args.batch_size, args.epochs, args.noise_ratio) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - '-d', '--dataset', - help="Dataset to use; either 'mnist', 'svhn', 'cifar-10', 'cifar-100'", - required=True, type=str - ) - parser.add_argument( - '-m', '--model_name', - help="Model name: 'ce', 'forward', 'backward', 'boot_hard', 'boot_soft', 'd2l'.", - required=True, type=str - ) - parser.add_argument( - '-e', '--epochs', - help="The number of epochs to train for.", - required=False, 
type=int - ) - parser.add_argument( - '-b', '--batch_size', - help="The batch size to use for training.", - required=False, type=int - ) - parser.add_argument( - '-r', '--noise_ratio', - help="The percentage of noisy labels [0, 100].", - required=False, type=int - ) - parser.set_defaults(epochs=150) - parser.set_defaults(batch_size=128) - parser.set_defaults(noise_ratio=0) - - os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -# args = parser.parse_args() -# main(args) - - args = parser.parse_args(['-d', 'cifar-10', '-m', 'd2l', - '-e', '120', '-b', '128', - '-r', '60']) - main(args) - - K.clear_session() diff --git a/trainer.py b/trainer.py new file mode 100644 index 0000000..322d843 --- /dev/null +++ b/trainer.py @@ -0,0 +1,83 @@ +import time +import torch +import os +from util import log_display, accuracy, AverageMeter + +if torch.cuda.is_available(): + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = True + device = torch.device('cuda') +else: + device = torch.device('cpu') + + +class Trainer(): + def __init__(self, data_loader, logger, config, name='Trainer', metrics='classfication'): + self.data_loader = data_loader + self.logger = logger + self.name = name + self.step = 0 + self.config = config + self.log_frequency = config.log_frequency + self.loss_meters = AverageMeter() + self.acc_meters = AverageMeter() + self.acc5_meters = AverageMeter() + self.report_metrics = self.classfication_metrics if metrics == 'classfication' else self.regression_metrics + + def train(self, epoch, GLOBAL_STEP, model, optimizer, criterion): + model.train() + for images, labels in self.data_loader: + images, labels = images.to(device, non_blocking=True), labels.to(device, non_blocking=True) + self.train_batch(images, labels, model, criterion, optimizer) + self.log(epoch, GLOBAL_STEP) + GLOBAL_STEP += 1 + return GLOBAL_STEP + + def train_batch(self, x, y, model, criterion, optimizer): + start = time.time() + model.zero_grad() + optimizer.zero_grad() + pred, _ = model(x) + loss = criterion(pred, y) + loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.config.grad_bound) + optimizer.step() + self.report_metrics(pred, y, loss) + self.logger_payload['lr'] = optimizer.param_groups[0]['lr'], + self.logger_payload['|gn|'] = grad_norm + end = time.time() + self.step += 1 + self.time_used = end - start + + def log(self, epoch, GLOBAL_STEP): + if GLOBAL_STEP % self.log_frequency == 0: + display = log_display(epoch=epoch, + global_step=GLOBAL_STEP, + time_elapse=self.time_used, + **self.logger_payload) + self.logger.info(display) + + def classfication_metrics(self, x, y, loss): + acc, acc5 = accuracy(x, y, topk=(1, 5)) + self.loss_meters.update(loss.item(), y.shape[0]) + self.acc_meters.update(acc.item(), y.shape[0]) + self.acc5_meters.update(acc5.item(), y.shape[0]) + self.logger_payload = {"acc": acc, + "acc_avg": self.acc_meters.avg, + "loss": loss, + "loss_avg": self.loss_meters.avg} + + def regression_metrics(self, x, y, loss): + diff = abs((x - y).mean().detach().item()) + self.loss_meters.update(loss.item(), y.shape[0]) + self.acc_meters.update(diff, y.shape[0]) + self.logger_payload = {"|diff|": diff, + "|diff_avg|": self.acc_meters.avg, + "loss": loss, + "loss_avg": self.loss_meters.avg} + + def _reset_stats(self): + self.loss_meters.reset() + self.acc_meters.reset() + self.acc5_meters.reset() diff --git a/util.py b/util.py index 4ed9a45..c537396 100644 --- a/util.py +++ b/util.py @@ -1,245 +1,147 @@ -from __future__ 
import absolute_import -from __future__ import print_function - +import logging import os -import multiprocessing as mp -from subprocess import call -import warnings +import torch import numpy as np -from numpy.testing import assert_array_almost_equal -from sklearn.preprocessing import MinMaxScaler -import keras.backend as K -from scipy.spatial.distance import pdist, cdist, squareform -from keras.callbacks import ModelCheckpoint, Callback -from keras.callbacks import LearningRateScheduler -import tensorflow as tf - -# Set random seed -np.random.seed(123) - - -def lid(logits, k=20): - """ - Calculate LID for a minibatch of training samples based on the outputs of the network. - - :param logits: - :param k: - :return: - """ - epsilon = 1e-12 - batch_size = tf.shape(logits)[0] - # n_samples = logits.get_shape().as_list() - # calculate pairwise distance - r = tf.reduce_sum(logits * logits, 1) - # turn r into column vector - r1 = tf.reshape(r, [-1, 1]) - D = r1 - 2 * tf.matmul(logits, tf.transpose(logits)) + tf.transpose(r1) + \ - tf.ones([batch_size, batch_size]) - - # find the k nearest neighbor - D1 = -tf.sqrt(D) - D2, _ = tf.nn.top_k(D1, k=k, sorted=True) - D3 = -D2[:, 1:] # skip the x-to-x distance 0 by using [,1:] - - m = tf.transpose(tf.multiply(tf.transpose(D3), 1.0 / D3[:, -1])) - v_log = tf.reduce_sum(tf.log(m + epsilon), axis=1) # to avoid nan - lids = -k / v_log - return lids - - -def mle_single(data, x, k): - """ - lid of a single query point x. - numpy implementation. - - :param data: - :param x: - :param k: - :return: - """ - data = np.asarray(data, dtype=np.float32) - x = np.asarray(x, dtype=np.float32) - if x.ndim == 1: - x = x.reshape((-1, x.shape[0])) - # dim = x.shape[1] - - k = min(k, len(data) - 1) - f = lambda v: - k / np.sum(np.log(v / v[-1] + 1e-8)) - a = cdist(x, data) - a = np.apply_along_axis(np.sort, axis=1, arr=a)[:, 1:k + 1] - a = np.apply_along_axis(f, axis=1, arr=a) - return a[0] - - -def mle_batch(data, batch, k): - """ - lid of a batch of query points X. - numpy implementation. 
- - :param data: - :param batch: - :param k: - :return: - """ - data = np.asarray(data, dtype=np.float32) - batch = np.asarray(batch, dtype=np.float32) - - k = min(k, len(data) - 1) - f = lambda v: - k / np.sum(np.log(v / v[-1] + 1e-8)) - a = cdist(batch, data) - a = np.apply_along_axis(np.sort, axis=1, arr=a)[:, 1:k + 1] - a = np.apply_along_axis(f, axis=1, arr=a) - return a - - -def other_class(n_classes, current_class): - """ - Returns a list of class indices excluding the class indexed by class_ind - :param nb_classes: number of classes in the task - :param class_ind: the class index to be omitted - :return: one random class that != class_ind - """ - if current_class < 0 or current_class >= n_classes: - error_str = "class_ind must be within the range (0, nb_classes - 1)" - raise ValueError(error_str) - - other_class_list = list(range(n_classes)) - other_class_list.remove(current_class) - other_class = np.random.choice(other_class_list) - return other_class - - -def get_lids_random_batch(model, X, k=20, batch_size=128): +import torch.nn.functional as F +from lid import lid_mle +from lass import lass + +class AverageMeter(object): + def __init__(self): + self.reset() + + def reset(self): + self.avg = 0 + self.sum = 0 + self.cnt = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.cnt += n + self.avg = self.sum / self.cnt + + +def accuracy(output, target, topk=(1,)): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].contiguous().view(-1).float().sum(0) + res.append(correct_k.mul_(1.0/batch_size)) + return res + + +def log_display(epoch, global_step, time_elapse, **kwargs): + display = 'epoch=' + str(epoch) + \ + '\tglobal_step=' + str(global_step) + for key, value in kwargs.items(): + display += '\t' + str(key) + '=%.5f' % value + display += '\ttime=%.2fit/s' % (1. 
/ time_elapse) + return display + + +def chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + + +def setup_logger(name, log_file, level=logging.INFO): + """To setup as many loggers as you want""" + formatter = logging.Formatter('%(asctime)s %(message)s') + console_handler = logging.StreamHandler() + console_handler.setFormatter(formatter) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(formatter) + logger = logging.getLogger(name) + logger.setLevel(level) + logger.addHandler(file_handler) + logger.addHandler(console_handler) + return logger + + +def build_dirs(path): + if not os.path.exists(path): + os.makedirs(path) + return + + +def save_model(filename, model, optimizer, scheduler, epoch, **kwargs): + # Torch Save State Dict + state = { + 'epoch': epoch+1, + 'model': model.state_dict() if model is not None else None, + 'optimizer': optimizer.state_dict() if optimizer is not None else None, + 'scheduler': scheduler.state_dict() if scheduler is not None else None, + } + for key, value in kwargs.items(): + state[key] = value + torch.save(state, filename+'.pth') + return + + +def load_model(filename, model, optimizer, scheduler, **kwargs): + checkpoints = torch.load(filename + '.pth') + if model is not None and checkpoints['model'] is not None: + model.load_state_dict(checkpoints['model']) + if optimizer is not None and checkpoints['optimizer'] is not None: + optimizer.load_state_dict(checkpoints['optimizer']) + if scheduler is not None and checkpoints['scheduler'] is not None: + scheduler.load_state_dict(checkpoints['scheduler']) + print("%s Loaded!" % (filename)) + return checkpoints + + +def count_parameters_in_MB(model): + return sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary_head" not in name)/1e6 + + +def get_lids_random_batch(model, data_loader, device, k=20, batch_size=128, batch_num=10): """ Get the local intrinsic dimensionality of each Xi in X_adv estimated by k close neighbours in the random batch it lies in. 
- :param model: if None: lid of raw inputs, otherwise LID of deep representations - :param X: normal images - :param k: the number of nearest neighbours for LID estimation - :param batch_size: default 100 - :return: lids: LID of normal images of shape (num_examples, lid_dim) - lids_adv: LID of advs images of shape (num_examples, lid_dim) """ - if model is None: - lids = [] - n_batches = int(np.ceil(X.shape[0] / float(batch_size))) - for i_batch in range(n_batches): - start = i_batch * batch_size - end = np.minimum(len(X), (i_batch + 1) * batch_size) - X_batch = X[start:end].reshape((end - start, -1)) - - # Maximum likelihood estimation of local intrinsic dimensionality (LID) - lid_batch = mle_batch(X_batch, X_batch, k=k) - lids.extend(lid_batch) - - lids = np.asarray(lids, dtype=np.float32) - return lids - - # get deep representations - funcs = [K.function([model.layers[0].input, K.learning_phase()], [out]) - for out in [model.get_layer("lid").output]] - lid_dim = len(funcs) - - # print("Number of layers to estimate: ", lid_dim) - - def estimate(i_batch): - start = i_batch * batch_size - end = np.minimum(len(X), (i_batch + 1) * batch_size) - n_feed = end - start - lid_batch = np.zeros(shape=(n_feed, lid_dim)) - for i, func in enumerate(funcs): - X_act = func([X[start:end], 0])[0] - X_act = np.asarray(X_act, dtype=np.float32).reshape((n_feed, -1)) - - # Maximum likelihood estimation of local intrinsic dimensionality (LID) - lid_batch[:, i] = mle_batch(X_act, X_act, k=k) - - return lid_batch lids = [] - n_batches = int(np.ceil(X.shape[0] / float(batch_size))) - for i_batch in range(n_batches): - lid_batch = estimate(i_batch) - lids.extend(lid_batch) + model.eval() + + def estimate(images): + images = images.to(device, non_blocking = True) + #get the output of the second-to-last layer of the network + with torch.no_grad(): + _, X_act = model(images) + + lid_batch = lid_mle(X_act, X_act, k=k) + return lid_batch - lids = np.asarray(lids, dtype=np.float32) + + for j, (images,labels) in enumerate(data_loader['train_dataset']): + if j < batch_num: + lid_batch = estimate(images) + lids.extend(lid_batch) + lids = torch.stack(lids, dim=0).type(torch.float32) return lids - -def get_lr_scheduler(dataset): - """ - customerized learning rate decay for training with clean labels. - For efficientcy purpose we use large lr for noisy data. - :param dataset: - :param noise_ratio: - :return: - """ - if dataset in ['mnist', 'svhn']: - def scheduler(epoch): - if epoch > 40: - return 0.001 - elif epoch > 20: - return 0.01 - else: - return 0.1 - - return LearningRateScheduler(scheduler) - elif dataset in ['cifar-10']: - def scheduler(epoch): - if epoch > 80: - return 0.001 - elif epoch > 40: - return 0.01 - else: - return 0.1 - - return LearningRateScheduler(scheduler) - elif dataset in ['cifar-100']: - def scheduler(epoch): - if epoch > 120: - return 0.001 - elif epoch > 80: - return 0.01 - else: - return 0.1 - - return LearningRateScheduler(scheduler) - - -def uniform_noise_model_P(num_classes, noise): - """ The noise matrix flips any class to any other with probability - noise / (num_classes - 1). - """ - - assert (noise >= 0.) and (noise <= 1.) - - P = noise / (num_classes - 1) * np.ones((num_classes, num_classes)) - np.fill_diagonal(P, (1 - noise) * np.ones(num_classes)) - - assert_array_almost_equal(P.sum(axis=1), 1, 1) - return P - - -def get_deep_representations(model, X, batch_size=128): - """ - Get the deep representations before logits. 
- :param model: - :param X: - :param batch_size: - :return: - """ - # last hidden layer is always at index -4 - output_dim = model.layers[-3].output.shape[-1].value - get_encoding = K.function( - [model.layers[0].input, K.learning_phase()], - [model.layers[-3].output] - ) - - n_batches = int(np.ceil(X.shape[0] / float(batch_size))) - output = np.zeros(shape=(len(X), output_dim)) - for i in range(n_batches): - output[i * batch_size:(i + 1) * batch_size] = \ - get_encoding([X[i * batch_size:(i + 1) * batch_size], 0])[0] - - return output +def get_csr_random_batch(model, data_loader, device, batch_size=128, batch_num=4): + model.eval() + adv_ind_sum = 0 + for j, (images,labels) in enumerate(data_loader['test_dataset']): + if j < batch_num: + images = images.to(device, non_blocking = True) + scale_factor = 255. / (torch.max(images) - torch.min(images)) + #scale_factor = 1 + csr_model = lass(model, device, a=0.25 / scale_factor, b=0.2 / scale_factor, r=0.3 / scale_factor, iter_max=100) + X_adv, adv_ind = csr_model.find(images) + adv_ind_sum += torch.sum(adv_ind) + + samples_num = batch_num * batch_size + csr = adv_ind_sum * 1. / samples_num + return csr + \ No newline at end of file
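
The patch adds `trainer.py` and the rewritten `util.py` but no standalone example of how they are driven. Below is a minimal, hypothetical usage sketch, not part of the patch: `ToyNet`, the random tensors, and the `SimpleNamespace` config values are invented for illustration. The only assumptions taken from the code above are that the model's forward returns `(logits, features)`, that the config object exposes `log_frequency` and `grad_bound`, and that `util.py`'s own imports (`lid.py`, `lass.py`) resolve from the repository root. In the repository itself these objects are built by the config-driven entry point; the sketch only shows the calling convention.

```python
# Hypothetical sketch of driving the new Trainer and LID utilities.
import logging
from types import SimpleNamespace

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from trainer import Trainer                # new module added by this patch
from util import get_lids_random_batch     # rewritten in this patch


class ToyNet(nn.Module):
    """Tiny classifier returning (logits, penultimate features), matching the
    `pred, _ = model(x)` convention that Trainer and util.py rely on."""

    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(nn.Flatten(),
                                      nn.Linear(3 * 32 * 32, 64),
                                      nn.ReLU())
        self.classifier = nn.Linear(64, num_classes)

    def forward(self, x):
        feat = self.features(x)
        return self.classifier(feat), feat


if __name__ == '__main__':
    # Random CIFAR-shaped tensors stand in for a real (noisy-label) dataset.
    images = torch.randn(256, 3, 32, 32)
    labels = torch.randint(0, 10, (256,))
    loader = DataLoader(TensorDataset(images, labels), batch_size=32, shuffle=True)

    # Trainer only reads these two attributes from the config object.
    config = SimpleNamespace(log_frequency=2, grad_bound=5.0)
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('sketch')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = ToyNet().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    trainer = Trainer(loader, logger, config)
    global_step = 0
    for epoch in range(2):
        global_step = trainer.train(epoch, global_step, model, optimizer, criterion)

    # LID of the learned representations over the first two training batches;
    # get_lids_random_batch expects a dict of loaders keyed by 'train_dataset'.
    lids = get_lids_random_batch(model, {'train_dataset': loader}, device,
                                 k=20, batch_num=2)
    logger.info('mean LID estimate: %.3f' % lids.mean().item())
```

Replace the random tensors with the real data loader and the `SimpleNamespace` with the experiment config to reproduce the actual training loop.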