diff --git a/eval_translation.py b/eval_translation.py
index 11acfae..68bc0a4 100644
--- a/eval_translation.py
+++ b/eval_translation.py
@@ -58,6 +58,7 @@ def main():
     parser.add_argument('--seed', type=int, default=0, help='the random seed')
     parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
     parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
+    parser.add_argument('--device_id', default=None, type=int, help='device ID used for the --cuda option (defaults to the current CUDA device, normally 0)')
     args = parser.parse_args()
 
     # Choose the right dtype for the desired precision
@@ -80,6 +81,8 @@ def main():
             print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
             sys.exit(-1)
         xp = get_cupy()
+        if args.device_id is not None:
+            xp.cuda.Device(args.device_id).use()
         x = xp.asarray(x)
         z = xp.asarray(z)
     else:
diff --git a/map_embeddings.py b/map_embeddings.py
index 4882504..9a9014f 100644
--- a/map_embeddings.py
+++ b/map_embeddings.py
@@ -13,42 +13,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-import embeddings
-from cupy_utils import *
-
 import argparse
-import collections
-import numpy as np
-import re
 import sys
-import time
-
-
-def dropout(m, p):
-    if p <= 0.0:
-        return m
-    else:
-        xp = get_array_module(m)
-        mask = xp.random.rand(*m.shape) >= p
-        return m*mask
-
-
-def topk_mean(m, k, inplace=False):  # TODO Assuming that axis is 1
-    xp = get_array_module(m)
-    n = m.shape[0]
-    ans = xp.zeros(n, dtype=m.dtype)
-    if k <= 0:
-        return ans
-    if not inplace:
-        m = xp.array(m)
-    ind0 = xp.arange(n)
-    ind1 = xp.empty(n, dtype=int)
-    minimum = m.min()
-    for i in range(k):
-        m.argmax(axis=1, out=ind1)
-        ans += m[ind0, ind1]
-        m[ind0, ind1] = minimum
-    return ans / k
+from vecmap import VecMap
 
 
 def main():
@@ -59,8 +26,10 @@ def main():
     parser.add_argument('src_output', help='the output source embeddings')
     parser.add_argument('trg_output', help='the output target embeddings')
     parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
-    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
+    parser.add_argument('--precision', choices=['float16', 'float32', 'float64'], default='float32', help='the floating-point precision (defaults to float32)')
     parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
+    parser.add_argument('--device_id', default=None, type=int, help='device ID used for the --cuda option (defaults to the current CUDA device, normally 0)')
+    parser.add_argument('--pca', action='store_true', help='apply PCA to the input embeddings first')
     parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
     parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
 
@@ -82,7 +51,7 @@
     init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
     init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
     init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
-    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
+    init_group.add_argument('--unsupervised_vocab', type=int, default=None, help='restrict the vocabulary to the top k entries for unsupervised initialization')
 
     mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
     mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
@@ -91,14 +60,14 @@
     mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
     mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
     mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
-    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
+    mapping_group.add_argument('--dim_reduction', type=int, default=None, help='apply dimensionality reduction')
     mapping_type = mapping_group.add_mutually_exclusive_group()
     mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
     mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
 
     self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
     self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
-    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
+    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=None, help='restrict the vocabulary to the top k entries')
     self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
     self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
     self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
@@ -106,8 +75,6 @@
     self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)')
     self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
     self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
-    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
-    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
     args = parser.parse_args()
 
     if args.supervised is not None:
@@ -128,295 +95,55 @@
         parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
     args = parser.parse_args()
 
-    # Check command line arguments
-    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
-        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
-        sys.exit(-1)
-
-    # Choose the right dtype for the desired precision
-    if args.precision == 'fp16':
-        dtype = 'float16'
-    elif args.precision == 'fp32':
-        dtype = 'float32'
-    elif args.precision == 'fp64':
-        dtype = 'float64'
-
-    # Read input embeddings
-    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
-    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
-    src_words, x = embeddings.read(srcfile, dtype=dtype)
-    trg_words, z = embeddings.read(trgfile, dtype=dtype)
-
-    # NumPy/CuPy management
+    training_mode = 'orthogonal' if args.orthogonal else \
+                    'unconstrained' if args.unconstrained else \
+                    'advanced'
+    init_dictionary_mode = 'unsupervised' if args.init_unsupervised else \
+                           'identical' if args.init_identical else \
+                           'numerals' if args.init_numerals else \
+                           'seed'
+    vecmap = VecMap(
+        training_mode = training_mode,
+        whiten = args.whiten,
+        src_reweight = args.src_reweight,
+        trg_reweight = args.trg_reweight,
+        src_dewhiten = args.src_dewhiten,
+        trg_dewhiten = args.trg_dewhiten,
+        dim_reduction = args.dim_reduction,
+        init_dictionary_mode = init_dictionary_mode,
+        dictionary_induction_direction = args.direction,
+        unsupervised_dictionary_size = args.unsupervised_vocab,
+        vocabulary_cutoff = args.vocabulary_cutoff,
+        normalization_actions = args.normalize,
+        file_encodings = args.encoding,
+        csls = args.csls_neighborhood,
+        dtype = args.precision,
+    )
     if args.cuda:
-        if not supports_cupy():
-            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
-            sys.exit(-1)
-        xp = get_cupy()
-        x = xp.asarray(x)
-        z = xp.asarray(z)
-    else:
-        xp = np
-    xp.random.seed(args.seed)
-
-    # Build word to index map
-    src_word2ind = {word: i for i, word in enumerate(src_words)}
-    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
-
-    # STEP 0: Normalization
-    embeddings.normalize(x, args.normalize)
-    embeddings.normalize(z, args.normalize)
-
-    # Build the seed dictionary
-    src_indices = []
-    trg_indices = []
-    if args.init_unsupervised:
-        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
-        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
-        xsim = (u*s).dot(u.T)
-        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
-        zsim = (u*s).dot(u.T)
-        del u, s, vt
-        xsim.sort(axis=1)
-        zsim.sort(axis=1)
-        embeddings.normalize(xsim, args.normalize)
-        embeddings.normalize(zsim, args.normalize)
-        sim = xsim.dot(zsim.T)
-        if args.csls_neighborhood > 0:
-            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
-            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
-            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
-        if args.direction == 'forward':
-            src_indices = xp.arange(sim_size)
-            trg_indices = sim.argmax(axis=1)
-        elif args.direction == 'backward':
-            src_indices = sim.argmax(axis=0)
-            trg_indices = xp.arange(sim_size)
-        elif args.direction == 'union':
-            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
-            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
-        del xsim, zsim, sim
-    elif args.init_numerals:
-        numeral_regex = re.compile('^[0-9]+$')
-        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
-        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
-        numerals = src_numerals.intersection(trg_numerals)
-        for word in numerals:
-            src_indices.append(src_word2ind[word])
-            trg_indices.append(trg_word2ind[word])
-    elif args.init_identical:
-        identical = set(src_words).intersection(set(trg_words))
-        for word in identical:
-            src_indices.append(src_word2ind[word])
-            trg_indices.append(trg_word2ind[word])
-    else:
-        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
-        for line in f:
-            src, trg = line.split()
-            try:
-                src_ind = src_word2ind[src]
-                trg_ind = trg_word2ind[trg]
-                src_indices.append(src_ind)
-                trg_indices.append(trg_ind)
-            except KeyError:
-                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
-
-    # Read validation dictionary
+        vecmap.to_cuda(args.device_id)
+    vecmap.set_seed(args.seed)
+    vecmap.set_train_data(
+        src_input = args.src_input,
+        trg_input = args.trg_input,
+        seed_dictionary = args.init_dictionary,
+        pca = args.pca,
+    )
     if args.validation is not None:
-        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
-        validation = collections.defaultdict(set)
-        oov = set()
-        vocab = set()
-        for line in f:
-            src, trg = line.split()
-            try:
-                src_ind = src_word2ind[src]
-                trg_ind = trg_word2ind[trg]
-                validation[src_ind].add(trg_ind)
-                vocab.add(src)
-            except KeyError:
-                oov.add(src)
-        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
-        validation_coverage = len(validation) / (len(validation) + len(oov))
-
-    # Create log file
-    if args.log:
-        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')
-
-    # Allocate memory
-    xw = xp.empty_like(x)
-    zw = xp.empty_like(z)
-    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
-    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
-    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
-    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
-    if args.validation is not None:
-        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
-
-    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
-    src_indices_forward = xp.arange(src_size)
-    trg_indices_forward = xp.zeros(src_size, dtype=int)
-    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
-    src_indices_backward = xp.zeros(trg_size, dtype=int)
-    trg_indices_backward = xp.arange(trg_size)
-    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
-    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)
-
-    # Training loop
-    best_objective = objective = -100.
-    it = 1
-    last_improvement = 0
-    keep_prob = args.stochastic_initial
-    t = time.time()
-    end = not args.self_learning
-    while True:
-
-        # Increase the keep probability if we have not improve in args.stochastic_interval iterations
-        if it - last_improvement > args.stochastic_interval:
-            if keep_prob >= 1.0:
-                end = True
-            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
-            last_improvement = it
-
-        # Update the embedding mapping
-        if args.orthogonal or not end:  # orthogonal mapping
-            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
-            w = vt.T.dot(u.T)
-            x.dot(w, out=xw)
-            zw[:] = z
-        elif args.unconstrained:  # unconstrained mapping
-            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
-            w = x_pseudoinv.dot(z[trg_indices])
-            x.dot(w, out=xw)
-            zw[:] = z
-        else:  # advanced mapping
-
-            # TODO xw.dot(wx2, out=xw) and alike not working
-            xw[:] = x
-            zw[:] = z
-
-            # STEP 1: Whitening
-            def whitening_transformation(m):
-                u, s, vt = xp.linalg.svd(m, full_matrices=False)
-                return vt.T.dot(xp.diag(1/s)).dot(vt)
-            if args.whiten:
-                wx1 = whitening_transformation(xw[src_indices])
-                wz1 = whitening_transformation(zw[trg_indices])
-                xw = xw.dot(wx1)
-                zw = zw.dot(wz1)
-
-            # STEP 2: Orthogonal mapping
-            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
-            wz2 = wz2_t.T
-            xw = xw.dot(wx2)
-            zw = zw.dot(wz2)
-
-            # STEP 3: Re-weighting
-            xw *= s**args.src_reweight
-            zw *= s**args.trg_reweight
-
-            # STEP 4: De-whitening
-            if args.src_dewhiten == 'src':
-                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
-            elif args.src_dewhiten == 'trg':
-                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
-            if args.trg_dewhiten == 'src':
-                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
-            elif args.trg_dewhiten == 'trg':
-                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
-
-            # STEP 5: Dimensionality reduction
-            if args.dim_reduction > 0:
-                xw = xw[:, :args.dim_reduction]
-                zw = zw[:, :args.dim_reduction]
-
-        # Self-learning
-        if end:
-            break
-        else:
-            # Update the training dictionary
-            if args.direction in ('forward', 'union'):
-                if args.csls_neighborhood > 0:
-                    for i in range(0, trg_size, simbwd.shape[0]):
-                        j = min(i + simbwd.shape[0], trg_size)
-                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
-                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
-                for i in range(0, src_size, simfwd.shape[0]):
-                    j = min(i + simfwd.shape[0], src_size)
-                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
-                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
-                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
-                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
-            if args.direction in ('backward', 'union'):
-                if args.csls_neighborhood > 0:
-                    for i in range(0, src_size, simfwd.shape[0]):
-                        j = min(i + simfwd.shape[0], src_size)
-                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
-                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
-                for i in range(0, trg_size, simbwd.shape[0]):
-                    j = min(i + simbwd.shape[0], trg_size)
-                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
-                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
-                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
-                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
-            if args.direction == 'forward':
-                src_indices = src_indices_forward
-                trg_indices = trg_indices_forward
-            elif args.direction == 'backward':
-                src_indices = src_indices_backward
-                trg_indices = trg_indices_backward
-            elif args.direction == 'union':
-                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
-                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
-
-            # Objective function evaluation
-            if args.direction == 'forward':
-                objective = xp.mean(best_sim_forward).tolist()
-            elif args.direction == 'backward':
-                objective = xp.mean(best_sim_backward).tolist()
-            elif args.direction == 'union':
-                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
-            if objective - best_objective >= args.threshold:
-                last_improvement = it
-                best_objective = objective
-
-            # Accuracy and similarity evaluation in validation
-            if args.validation is not None:
-                src = list(validation.keys())
-                xw[src].dot(zw.T, out=simval)
-                nn = asnumpy(simval.argmax(axis=1))
-                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
-                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])
-
-            # Logging
-            duration = time.time() - t
-            if args.verbose:
-                print(file=sys.stderr)
-                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
-                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
-                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
-                if args.validation is not None:
-                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
-                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
-                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
-                sys.stderr.flush()
-            if args.log is not None:
-                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
-                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
-                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
-                log.flush()
-
-        t = time.time()
-        it += 1
-
-    # Write mapped embeddings
-    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
-    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
-    embeddings.write(src_words, xw, srcfile)
-    embeddings.write(trg_words, zw, trgfile)
-    srcfile.close()
-    trgfile.close()
-
+        vecmap.set_validation_dictionary(args.validation)
+    if args.self_learning:
+        vecmap.self_learning_train(
+            dict_update_batch_size = args.batch_size,
+            objective_threshold = args.threshold,
+            stochastic_interval = args.stochastic_interval,
+            stochastic_initial = args.stochastic_initial,
+        )
+    else:
+        vecmap.train()
+    vecmap.save_embeddings(
+        src_output = args.src_output,
+        trg_output = args.trg_output,
+    )
+
 
 if __name__ == '__main__':
     main()
diff --git a/vecmap.py b/vecmap.py
new file mode 100644
index 0000000..061e809
--- /dev/null
+++ b/vecmap.py
@@ -0,0 +1,444 @@
+import numpy as np
+from cupy_utils import supports_cupy, get_cupy, get_array_module, asnumpy
+import embeddings
+import re
+import time
+import collections
+from sklearn.decomposition import PCA
+
+
+def topk_mean(m, k, inplace=False):  # TODO Assuming that axis is 1
+    xp = get_array_module(m)
+    n = m.shape[0]
+    ans = xp.zeros(n, dtype=m.dtype)
+    if k <= 0:
+        return ans
+    if not inplace:
+        m = xp.array(m)
+    ind0 = xp.arange(n)
+    ind1 = xp.empty(n, dtype=int)
+    minimum = m.min()
+    for i in range(k):
+        m.argmax(axis=1, out=ind1)
+        ans += m[ind0, ind1]
+        m[ind0, ind1] = minimum
+    return ans / k
+
+
+def dropout(m, p):
+    if p <= 0.0:
+        return m
+    else:
+        xp = get_array_module(m)
+        mask = xp.random.rand(*m.shape) >= p
+        return m*mask
+
+
+class VecMap:
+    def __init__(
+        self,
+        training_mode : str = 'advanced',  # 'orthogonal' or 'unconstrained' or 'advanced'
+        whiten : bool = False,
+        src_reweight : float = 0.,
+        trg_reweight : float = 0.,
+        src_dewhiten : str = None,  # 'src' or 'trg'
+        trg_dewhiten : str = None,  # 'src' or 'trg'
+        dim_reduction : int = None,
+        init_dictionary_mode : str = 'unsupervised',  # 'unsupervised', 'numerals', 'identical', 'seed'
+        dictionary_induction_direction : str = 'union',  # 'forward', 'backward', 'union'
+        unsupervised_dictionary_size : int = None,
+        vocabulary_cutoff : int = None,
+        normalization_actions : list = [],  # ['unit', 'center', 'unitdim', 'centeremb', 'none']
+        file_encodings : str = 'utf-8',
+        csls : int = 0,
+        dtype : str = 'float32',
+    ) -> None:
+        VecMap._check_training_mode_validity(training_mode)
+        VecMap._check_dewhiten_mode_validity(src_dewhiten)
+        VecMap._check_dewhiten_mode_validity(trg_dewhiten)
+        VecMap._check_whitening_and_dewhitening_concordancy(whiten, src_dewhiten, trg_dewhiten)
+        VecMap._check_dictionary_induction_direction(dictionary_induction_direction)
+        VecMap._check_init_dictionary_mode(init_dictionary_mode)
+        VecMap._check_normalization_actions(normalization_actions)
+
+        self._training_mode = training_mode
+        self._whiten_is_active = whiten
+        self._src_dewhiten = src_dewhiten
+        self._trg_dewhiten = trg_dewhiten
+        self.x_rw = src_reweight
+        self.z_rw = trg_reweight
+        self.dim_reduction = dim_reduction
+        self._dictionary_induction_direction = dictionary_induction_direction
+        self._init_dictionary_mode = init_dictionary_mode
+        self._unsupervised_dictionary_size = np.inf if unsupervised_dictionary_size is None else unsupervised_dictionary_size
+        self._cutoff = np.inf if vocabulary_cutoff is None else vocabulary_cutoff
+        self.normalization_actions = normalization_actions
+        self.encoding = file_encodings
+        self.csls = csls
+        self.dtype = dtype
+
+        self.dictionary = None
+        self.validation_set = None
+        self.xp = np
+
+
+    @staticmethod
+    def _check_dewhiten_mode_validity(mode):
+        if mode is not None and mode not in ['src', 'trg']:  # None disables de-whitening
+            raise Exception(f"De-whitening mode can be None, 'src', or 'trg', not {mode}")
+    @staticmethod
+    def _check_whitening_and_dewhitening_concordancy(whiten, src_dewhiten, trg_dewhiten):
+        if (src_dewhiten is not None or trg_dewhiten is not None) and not whiten:
+            raise Exception('ERROR: De-whitening requires whitening first')
+    @staticmethod
+    def _check_training_mode_validity(mode):
+        if mode not in ['orthogonal', 'unconstrained', 'advanced']:
+            raise Exception(f"Training mode can be either 'orthogonal', 'unconstrained', or 'advanced', not {mode}")
+    @staticmethod
+    def _check_dictionary_induction_direction(direction):
+        if direction not in ['forward', 'backward', 'union']:
+            raise Exception(f"Dictionary induction direction can be either 'forward', 'backward', or 'union', not {direction}")
+    @staticmethod
+    def _check_init_dictionary_mode(mode):
+        if mode not in ['unsupervised', 'numerals', 'identical', 'seed']:
+            raise Exception(f"Init dictionary mode can be either 'unsupervised', 'numerals', 'identical', or 'seed', not {mode}")
+    @staticmethod
+    def _check_normalization_actions(actions):
+        for action in actions:
+            if action not in ['unit', 'center', 'unitdim', 'centeremb', 'none']:
+                raise Exception("Invalid normalization actions.")
+    @staticmethod
+    def _check_seed_dictionary(mode, path):
+        if mode=='seed' and path is None:
+            raise Exception("Init seed dictionary path is missing. Set the `seed_dictionary` argument.")
+
+
+    def to_cuda(self, device_id=None):
+        if not supports_cupy():
+            raise Exception('ERROR: Install CuPy for CUDA support')
+        self.xp = get_cupy()
+        if device_id is not None:
+            self.xp.cuda.Device(device_id).use()
+
+    def set_seed(self, seed):
+        self.xp.random.seed(seed)
+
+
+    def _whitening_transformation(self, m):
+        u, s, vt = self.xp.linalg.svd(m, full_matrices=False)
+        return vt.T @ self.xp.diag(1/s) @ vt
+
+
+    def _actual_whiten(self, a, indices):
+        w1 = self._whitening_transformation(a[indices])
+        return a @ w1, w1
+
+
+    def _whiten(self, a, indices):
+        if self._whiten_is_active:
+            return self._actual_whiten(a, indices)
+        else:
+            return a, None
+
+
+    def _bidirectional_orthogonal_map(self, x, ix, z, iz):
+        wx, s, wz_t = self.xp.linalg.svd(x[ix].T @ z[iz])
+        wz = wz_t.T
+        size = min(wx.shape[0], wz.shape[0])
+        wx, wz = wx[:, :size], wz[:, :size]
+        xw, zw = x @ wx, z @ wz
+        return xw, zw, wx, wz, s
+
+
+    def _reweight(self, a, s, rw=0):
+        return a * s**rw
+
+
+    def _actual_dewhiten(self, a, w1, w2):
+        return a @ w2.T @ self.xp.linalg.inv(w1) @ w2
+
+
+    def _dewhiten(self, a, wx1, wz1, wx2, wz2, mode):
+        if mode is None:
+            return a
+        if mode=='src':
+            w1, w2 = wx1, wx2
+        if mode=='trg':
+            w1, w2 = wz1, wz2
+        return self._actual_dewhiten(a, w1, w2)
+
+
+    def _dim_reduction(self, a):
+        return a[:, :self.dim_reduction]
+
+
+    def _advanced_map(self, x, ix, z, iz):
+        xw = x.copy()
+        zw = z.copy()
+
+        xw, wx1 = self._whiten(xw, ix)
+        zw, wz1 = self._whiten(zw, iz)
+
+        xw, zw, wx2, wz2, s = self._bidirectional_orthogonal_map(xw, ix, zw, iz)
+
+        xw = self._reweight(xw, s, self.x_rw)
+        zw = self._reweight(zw, s, self.z_rw)
+
+        xw = self._dewhiten(xw, wx1, wz1, wx2, wz2, self._src_dewhiten)
+        zw = self._dewhiten(zw, wx1, wz1, wx2, wz2, self._trg_dewhiten)
+
+        xw = self._dim_reduction(xw)
+        zw = self._dim_reduction(zw)
+
+        return xw, zw
+
+
+    def _orthogonal_map(self, x, ix, z, iz):
+        u, s, v_t = self.xp.linalg.svd(z[iz].T @ x[ix])
+        size = min(u.shape[0], v_t.shape[0])
+        u, v_t = u[:, :size], v_t[:size, :]
+        xw = x @ v_t.T @ u.T
+        zw = z.copy()
+        return xw, zw
+
+
+    def _unconstrained_map(self, x, ix, z, iz):
+        w = self.xp.linalg.inv(x[ix].T @ x[ix]) @ x[ix].T @ z[iz]
+        xw = x @ w
+        zw = z.copy()
+        return xw, zw
+
+
+    def _map(self, x, ix, z, iz, is_last_iteration):
+        if self._training_mode=='advanced':
+            if is_last_iteration:
+                return self._advanced_map(x, ix, z, iz)
+            else:
+                return self._orthogonal_map(x, ix, z, iz)
+        if self._training_mode=='orthogonal':
+            return self._orthogonal_map(x, ix, z, iz)
+        if self._training_mode=='unconstrained':
+            return self._unconstrained_map(x, ix, z, iz)
+
+
+    def _get_indices(self, sim):
+        if self._dictionary_induction_direction=='forward':
+            return self.xp.arange(sim.shape[0]), sim.argmax(axis=1)
+        if self._dictionary_induction_direction=='backward':
+            return sim.argmax(axis=0), self.xp.arange(sim.shape[1])
+        if self._dictionary_induction_direction=='union':
+            return self.xp.concatenate((self.xp.arange(sim.shape[0]), sim.argmax(axis=0))), \
+                   self.xp.concatenate((sim.argmax(axis=1), self.xp.arange(sim.shape[1])))
+
+
+    def _build_unsupervised_seed_dictionary(self):
+        size = min(self.x.shape[0], self.z.shape[0], self._unsupervised_dictionary_size)
+        u, s, vt = self.xp.linalg.svd(self.x[:size], full_matrices=False)
+        xsim = (u*s) @ u.T
+        u, s, vt = self.xp.linalg.svd(self.z[:size], full_matrices=False)
+        zsim = (u*s) @ u.T
+        del u, s, vt
+        xsim.sort(axis=1)
+        zsim.sort(axis=1)
+        embeddings.normalize(xsim, self.normalization_actions)
+        embeddings.normalize(zsim, self.normalization_actions)
+        sim = xsim @ zsim.T
+        if self.csls > 0:
+            knn_sim_fwd = topk_mean(sim, k=self.csls)
+            knn_sim_bwd = topk_mean(sim.T, k=self.csls)
+            sim -= knn_sim_fwd[:, self.xp.newaxis]/2 + knn_sim_bwd/2
+        return self._get_indices(sim)
+
+
+    def _build_identical_seed_dictionary(self, x_words, z_words):
+        identicals = set(x_words).intersection(set(z_words))
+        xi, zi = [], []
+        for word in identicals:
+            xi.append(self.src_word2ind[word])
+            zi.append(self.trg_word2ind[word])
+        return xi, zi
+
+
+    def _build_numerals_seed_dictionary(self):
+        numeral_regex = re.compile('^[0-9]+$')
+        x_numerals = {word for word in self.x_words if numeral_regex.match(word) is not None}
+        z_numerals = {word for word in self.z_words if numeral_regex.match(word) is not None}
+        return self._build_identical_seed_dictionary(x_numerals, z_numerals)
+
+
+    def _read_dictionary(self, path):
+        xi, zi = [], []
+        oov = []
+        with open(path, encoding=self.encoding, errors='surrogateescape') as f:
+            for line in f:
+                x_word, z_word = line.split()
+                try:
+                    x, z = self.src_word2ind[x_word], self.trg_word2ind[z_word]
+                    xi.append(x)
+                    zi.append(z)
+                except KeyError:
+                    oov.append((x_word, z_word))
+        return xi, zi, oov
+
+
+    def _build_seed_dictionary(self, seed_dictionary_path):
+        if self._init_dictionary_mode=='unsupervised':
+            return self._build_unsupervised_seed_dictionary()
+        if self._init_dictionary_mode=='numerals':
+            return self._build_numerals_seed_dictionary()
+        if self._init_dictionary_mode=='identical':
+            return self._build_identical_seed_dictionary(self.x_words, self.z_words)
+        if self._init_dictionary_mode=='seed':
+            xi, zi, _ = self._read_dictionary(seed_dictionary_path)
+            return xi, zi
+
+
+    def _select_indices_for_dictionary(self, x, x_size, z, z_size, keep_prob, batch_size):
+        knn_sim = self.xp.zeros(z_size, dtype=self.dtype)
+        if self.csls > 0:
+            for b in range(0, z_size, batch_size):
+                sim = z[b:min(b+batch_size, z_size)] @ x[:x_size].T  # cap the slice at the vocabulary cutoff
+                knn_sim[b:b+batch_size] = topk_mean(sim, k=self.csls, inplace=True)
+        best_sim = self.xp.full(x_size, -100, dtype=self.dtype)
+        indices = self.xp.zeros(x_size, dtype=int)
+        for b in range(0, x_size, batch_size):
+            sim = x[b:min(b+batch_size, x_size)] @ z[:z_size].T  # cap the slice at the vocabulary cutoff
+            best_sim[b:b+batch_size] = sim.max(axis=1)
+            sim -= knn_sim/2
+            indices[b:b+batch_size] = dropout(sim, 1 - keep_prob).argmax(axis=1)
+        return indices, self.xp.mean(best_sim).tolist()  # .tolist() gives a Python float for both NumPy and CuPy
+
+
+    def _rebuild_dictionary(self, x, z, keep_prob, batch_size):
+        x_size = min(x.shape[0], self._cutoff)
+        z_size = min(z.shape[0], self._cutoff)
+        if self._dictionary_induction_direction=='forward':
+            xi = self.xp.arange(x_size)
+            zi, objective = self._select_indices_for_dictionary(x, x_size, z, z_size, keep_prob, batch_size)
+        if self._dictionary_induction_direction=='backward':
+            xi, objective = self._select_indices_for_dictionary(z, z_size, x, x_size, keep_prob, batch_size)
+            zi = self.xp.arange(z_size)
+        if self._dictionary_induction_direction=='union':
+            xi, objective1 = self._select_indices_for_dictionary(z, z_size, x, x_size, keep_prob, batch_size)
+            zi, objective2 = self._select_indices_for_dictionary(x, x_size, z, z_size, keep_prob, batch_size)
+            objective = (objective1 + objective2) / 2
+            xi = self.xp.concatenate((self.xp.arange(x_size), xi))
+            zi = self.xp.concatenate((zi, self.xp.arange(z_size)))
+        return (xi, zi), objective
+
+
+    def set_validation_dictionary(self, path):
+        xi, zi, _ = self._read_dictionary(path)
+        self.validation_set = collections.defaultdict(set)
+        for x, z in zip(xi, zi):
+            self.validation_set[x].add(z)
+        print(f"Validation set length: {len(self.validation_set)}")
+
+
+    def validate(self, x, z):
+        src = list(self.validation_set.keys())
+        simval = x[src] @ z.T
+        nn = asnumpy(simval.argmax(axis=1))
+        accuracy = np.mean([(nn[i] in self.validation_set[src[i]]) for i in range(len(src))])
+        similarity = np.mean([max([simval[i, j].tolist() for j in self.validation_set[src[i]]]) for i in range(len(src))])
+        return accuracy, similarity
+
+
+    def _log(self, itr, duration, objective, keep_prob, similarity=None, accuracy=None):
+        print(f'ITERATION {itr} - DURATION {duration:.2f}s:')
+        print(f'\t- Objective: {objective:.6f}')
+        print(f'\t- Drop probability: {1-keep_prob:.6f}')
+        if accuracy is not None:
+            print(f'\t- Val. similarity: {similarity:.6f}')
+            print(f'\t- Val. accuracy: {accuracy:.6f}')
+
+
+    def set_train_data(
+        self,
+        src_input : str,
+        trg_input : str,
+        seed_dictionary : str = None,
+        pca : bool = False,
+        n_components : int = None,
+    ) -> None:
+        VecMap._check_seed_dictionary(self._init_dictionary_mode, seed_dictionary)
+        with open(src_input, encoding=self.encoding, errors='surrogateescape') as srcfile:
+            self.x_words, x = embeddings.read(srcfile, dtype=self.dtype)
+        with open(trg_input, encoding=self.encoding, errors='surrogateescape') as trgfile:
+            self.z_words, z = embeddings.read(trgfile, dtype=self.dtype)
+        if pca:
+            if n_components is None:
+                n_components = min(x.shape[1], z.shape[1])
+            x = PCA(n_components=n_components, svd_solver='full').fit_transform(x).astype(self.dtype)
+            z = PCA(n_components=n_components, svd_solver='full').fit_transform(z).astype(self.dtype)
+        self.x = self.xp.asarray(x)
+        self.z = self.xp.asarray(z)
+        embeddings.normalize(self.x, self.normalization_actions)
+        embeddings.normalize(self.z, self.normalization_actions)
+        self.src_word2ind = {word: i for i, word in enumerate(self.x_words)}
+        self.trg_word2ind = {word: i for i, word in enumerate(self.z_words)}
+        self.ix, self.iz = self._build_seed_dictionary(seed_dictionary)
+
+
+    def train(
+        self,
+    ) -> None:
+        self.xw, self.zw = self._map(self.x, self.ix, self.z, self.iz, True)
+
+
+    def self_learning_train(
+        self,
+        dict_update_batch_size : int = 10000,
+        stochastic_initial : float = 0.1,
+        objective_threshold : float = 0.000001,
+        stochastic_interval : int = 50,
+        stochastic_multiplier : float = 2.,
+        log : bool = True,
+    ) -> None:
+        xw, zw = self.x.copy(), self.z.copy()
+
+        finish = False
+        best_objective = objective = -100
+        last_improvement = -1
+        t0 = None
+        keep_prob = stochastic_initial
+
+        itr = 0
+        while not finish:
+            if itr:
+                (self.ix, self.iz), objective = self._rebuild_dictionary(xw, zw, keep_prob, dict_update_batch_size)
+
+            if log:
+                accuracy, similarity = None, None
+                if self.validation_set:
+                    accuracy, similarity = self.validate(xw, zw)
+
+                duration = time.time() - t0 if t0 is not None else 0
+                self._log(itr, duration, objective, keep_prob, similarity, accuracy)
+
+            if objective - best_objective >= objective_threshold:
+                last_improvement = itr
+                best_objective = objective
+            if itr - last_improvement > stochastic_interval:
+                finish = keep_prob >= 1.0  # stop only once the interval expires at full keep probability
+                keep_prob = min(1.0, stochastic_multiplier*keep_prob)
+                last_improvement = itr
+
+            t0 = time.time()
+            xw, zw = self._map(self.x, self.ix, self.z, self.iz, finish)
+
+            itr += 1
+
+        self.xw, self.zw = xw, zw
+
+
+    def save_embeddings(
+        self,
+        src_output : str,
+        trg_output : str,
+    ) -> None:
+        for words, emb, file_name in [[self.x_words, self.xw, src_output],
+                                      [self.z_words, self.zw, trg_output]]:
+            with open(file_name, mode='w', encoding=self.encoding, errors='surrogateescape') as f:
+                embeddings.write(words, emb, f)
+
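
---

Usage note (supplementary, not part of the patch): a minimal sketch of the refactored API, mirroring the wiring that map_embeddings.py now performs. The embedding file paths are placeholders; the class, methods, and keyword arguments are the ones introduced in vecmap.py above.

    from vecmap import VecMap

    vecmap = VecMap(
        training_mode='advanced',             # 'orthogonal' | 'unconstrained' | 'advanced'
        init_dictionary_mode='unsupervised',  # no seed dictionary file required
        whiten=True,                          # must be enabled before de-whitening
        src_dewhiten='src',
        trg_dewhiten='trg',
        csls=10,
        normalization_actions=['unit', 'center'],
    )
    # vecmap.to_cuda(device_id=0)            # optional; requires CuPy
    vecmap.set_seed(0)
    vecmap.set_train_data(src_input='src.emb.txt', trg_input='trg.emb.txt')
    vecmap.self_learning_train(dict_update_batch_size=10000)
    vecmap.save_embeddings(src_output='src.mapped.txt', trg_output='trg.mapped.txt')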
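CLI note (also supplementary): --device_id only takes effect together with --cuda; when it is omitted, CuPy's current device (normally 0) is used. A hypothetical invocation pinning the job to the second GPU, with placeholder paths:

    python3 map_embeddings.py --init_unsupervised --self_learning \
        --cuda --device_id 1 --precision float32 \
        src.emb.txt trg.emb.txt src.mapped.txt trg.mapped.txt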