diff --git a/eval_translation.py b/eval_translation.py
index 11acfae..68bc0a4 100644
--- a/eval_translation.py
+++ b/eval_translation.py
@@ -58,6 +58,7 @@ def main():
     parser.add_argument('--seed', type=int, default=0, help='the random seed')
     parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
     parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
+    parser.add_argument('--device_id', default=None, type=int, help='device ID used for the --cuda option (defaults to the current CUDA device, normally 0)')
     args = parser.parse_args()
 
     # Choose the right dtype for the desired precision
@@ -80,6 +81,8 @@ def main():
             print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
             sys.exit(-1)
         xp = get_cupy()
+        if args.device_id is not None:
+            xp.cuda.Device(args.device_id).use()
         x = xp.asarray(x)
         z = xp.asarray(z)
     else:
diff --git a/map_embeddings.py b/map_embeddings.py
index 4882504..9a9014f 100644
--- a/map_embeddings.py
+++ b/map_embeddings.py
@@ -13,42 +13,9 @@
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-import embeddings
-from cupy_utils import *
-
 import argparse
-import collections
-import numpy as np
-import re
 import sys
-import time
-
-
-def dropout(m, p):
-    if p <= 0.0:
-        return m
-    else:
-        xp = get_array_module(m)
-        mask = xp.random.rand(*m.shape) >= p
-        return m*mask
-
-
-def topk_mean(m, k, inplace=False):  # TODO Assuming that axis is 1
-    xp = get_array_module(m)
-    n = m.shape[0]
-    ans = xp.zeros(n, dtype=m.dtype)
-    if k <= 0:
-        return ans
-    if not inplace:
-        m = xp.array(m)
-    ind0 = xp.arange(n)
-    ind1 = xp.empty(n, dtype=int)
-    minimum = m.min()
-    for i in range(k):
-        m.argmax(axis=1, out=ind1)
-        ans += m[ind0, ind1]
-        m[ind0, ind1] = minimum
-    return ans / k
+from vecmap import VecMap
 
 
 def main():
@@ -59,8 +26,10 @@ def main():
     parser.add_argument('src_output', help='the output source embeddings')
     parser.add_argument('trg_output', help='the output target embeddings')
     parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
-    parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
+    parser.add_argument('--precision', choices=['float16', 'float32', 'float64'], default='float32', help='the floating-point precision (defaults to float32)')
     parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
+    parser.add_argument('--device_id', default=None, type=int, help='device ID used for the --cuda option (defaults to the current CUDA device, normally 0)')
+    parser.add_argument('--pca', action='store_true', help='apply PCA to the input embeddings first')
     parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
     parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
 
@@ -82,7 +51,7 @@
     init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary')
     init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary')
     init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization')
-    init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization')
+    init_group.add_argument('--unsupervised_vocab', type=int, default=None, help='restrict the vocabulary to the top k entries for unsupervised initialization')
 
     mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments')
     mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order')
@@ -91,14 +60,14 @@
     mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings')
     mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings')
     mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings')
-    mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction')
+    mapping_group.add_argument('--dim_reduction', type=int, default=None, help='apply dimensionality reduction')
     mapping_type = mapping_group.add_mutually_exclusive_group()
     mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping')
     mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping')
 
     self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning')
     self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning')
-    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries')
+    self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=None, help='restrict the vocabulary to the top k entries')
     self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)')
     self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction')
     self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the convergence threshold (defaults to 0.000001)')
@@ -106,8 +75,6 @@
     self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)')
     self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)')
     self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)')
-    self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration')
-    self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration')
     args = parser.parse_args()
 
     if args.supervised is not None:
@@ -128,295 +95,55 @@
         parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000)
     args = parser.parse_args()
 
-    # Check command line arguments
-    if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten:
-        print('ERROR: De-whitening requires whitening first', file=sys.stderr)
-        sys.exit(-1)
-
-    # Choose the right dtype for the desired precision
-    if args.precision == 'fp16':
-        dtype = 'float16'
-    elif args.precision == 'fp32':
-        dtype = 'float32'
-    elif args.precision == 'fp64':
-        dtype = 'float64'
-
-    # Read input embeddings
-    srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape')
-    trgfile = open(args.trg_input, encoding=args.encoding, errors='surrogateescape')
-    src_words, x = embeddings.read(srcfile, dtype=dtype)
-    trg_words, z = embeddings.read(trgfile, dtype=dtype)
-
-    # NumPy/CuPy management
+    training_mode = 'orthogonal' if args.orthogonal else \
+                    'unconstrained' if args.unconstrained else \
+                    'advanced'
+    init_dictionary_mode = 'unsupervised' if args.init_unsupervised else \
+                           'identical' if args.init_identical else \
+                           'numerals' if args.init_numerals else \
+                           'seed'
+    vecmap = VecMap(
+        training_mode = training_mode,
+        whiten = args.whiten,
+        src_reweight = args.src_reweight,
+        trg_reweight = args.trg_reweight,
+        src_dewhiten = args.src_dewhiten,
+        trg_dewhiten = args.trg_dewhiten,
+        dim_reduction = args.dim_reduction,
+        init_dictionary_mode = init_dictionary_mode,
+        dictionary_induction_direction = args.direction,
+        unsupervised_dictionary_size = args.unsupervised_vocab,
+        vocabulary_cutoff = args.vocabulary_cutoff,
+        normalization_actions = args.normalize,
+        file_encodings = args.encoding,
+        csls = args.csls_neighborhood,
+        dtype = args.precision,
+    )
     if args.cuda:
-        if not supports_cupy():
-            print('ERROR: Install CuPy for CUDA support', file=sys.stderr)
-            sys.exit(-1)
-        xp = get_cupy()
-        x = xp.asarray(x)
-        z = xp.asarray(z)
-    else:
-        xp = np
-    xp.random.seed(args.seed)
-
-    # Build word to index map
-    src_word2ind = {word: i for i, word in enumerate(src_words)}
-    trg_word2ind = {word: i for i, word in enumerate(trg_words)}
-
-    # STEP 0: Normalization
-    embeddings.normalize(x, args.normalize)
-    embeddings.normalize(z, args.normalize)
-
-    # Build the seed dictionary
-    src_indices = []
-    trg_indices = []
-    if args.init_unsupervised:
-        sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab)
-        u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False)
-        xsim = (u*s).dot(u.T)
-        u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False)
-        zsim = (u*s).dot(u.T)
-        del u, s, vt
-        xsim.sort(axis=1)
-        zsim.sort(axis=1)
-        embeddings.normalize(xsim, args.normalize)
-        embeddings.normalize(zsim, args.normalize)
-        sim = xsim.dot(zsim.T)
-        if args.csls_neighborhood > 0:
-            knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood)
-            knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood)
-            sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2
-        if args.direction == 'forward':
-            src_indices = xp.arange(sim_size)
-            trg_indices = sim.argmax(axis=1)
-        elif args.direction == 'backward':
-            src_indices = sim.argmax(axis=0)
-            trg_indices = xp.arange(sim_size)
-        elif args.direction == 'union':
-            src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0)))
-            trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size)))
-        del xsim, zsim, sim
-    elif args.init_numerals:
-        numeral_regex = re.compile('^[0-9]+$')
-        src_numerals = {word for word in src_words if numeral_regex.match(word) is not None}
-        trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None}
-        numerals = src_numerals.intersection(trg_numerals)
-        for word in numerals:
-            src_indices.append(src_word2ind[word])
-            trg_indices.append(trg_word2ind[word])
-    elif args.init_identical:
-        identical = set(src_words).intersection(set(trg_words))
-        for word in identical:
-            src_indices.append(src_word2ind[word])
-            trg_indices.append(trg_word2ind[word])
-    else:
-        f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape')
-        for line in f:
-            src, trg = line.split()
-            try:
-                src_ind = src_word2ind[src]
-                trg_ind = trg_word2ind[trg]
-                src_indices.append(src_ind)
-                trg_indices.append(trg_ind)
-            except KeyError:
-                print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr)
-
-    # Read validation dictionary
+        vecmap.to_cuda(args.device_id)
+    vecmap.set_seed(args.seed)
+    vecmap.set_train_data(
+        src_input = args.src_input,
+        trg_input = args.trg_input,
+        seed_dictionary = args.init_dictionary,
+        pca = args.pca,
+    )
     if args.validation is not None:
-        f = open(args.validation, encoding=args.encoding, errors='surrogateescape')
-        validation = collections.defaultdict(set)
-        oov = set()
-        vocab = set()
-        for line in f:
-            src, trg = line.split()
-            try:
-                src_ind = src_word2ind[src]
-                trg_ind = trg_word2ind[trg]
-                validation[src_ind].add(trg_ind)
-                vocab.add(src)
-            except KeyError:
-                oov.add(src)
-        oov -= vocab  # If one of the translation options is in the vocabulary, then the entry is not an oov
-        validation_coverage = len(validation) / (len(validation) + len(oov))
-
-    # Create log file
-    if args.log:
-        log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape')
-
-    # Allocate memory
-    xw = xp.empty_like(x)
-    zw = xp.empty_like(z)
-    src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff)
-    trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff)
-    simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype)
-    simbwd = xp.empty((args.batch_size, src_size), dtype=dtype)
-    if args.validation is not None:
-        simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype)
-
-    best_sim_forward = xp.full(src_size, -100, dtype=dtype)
-    src_indices_forward = xp.arange(src_size)
-    trg_indices_forward = xp.zeros(src_size, dtype=int)
-    best_sim_backward = xp.full(trg_size, -100, dtype=dtype)
-    src_indices_backward = xp.zeros(trg_size, dtype=int)
-    trg_indices_backward = xp.arange(trg_size)
-    knn_sim_fwd = xp.zeros(src_size, dtype=dtype)
-    knn_sim_bwd = xp.zeros(trg_size, dtype=dtype)
-
-    # Training loop
-    best_objective = objective = -100.
-    it = 1
-    last_improvement = 0
-    keep_prob = args.stochastic_initial
-    t = time.time()
-    end = not args.self_learning
-    while True:
-
-        # Increase the keep probability if we have not improve in args.stochastic_interval iterations
-        if it - last_improvement > args.stochastic_interval:
-            if keep_prob >= 1.0:
-                end = True
-            keep_prob = min(1.0, args.stochastic_multiplier*keep_prob)
-            last_improvement = it
-
-        # Update the embedding mapping
-        if args.orthogonal or not end:  # orthogonal mapping
-            u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices]))
-            w = vt.T.dot(u.T)
-            x.dot(w, out=xw)
-            zw[:] = z
-        elif args.unconstrained:  # unconstrained mapping
-            x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T)
-            w = x_pseudoinv.dot(z[trg_indices])
-            x.dot(w, out=xw)
-            zw[:] = z
-        else:  # advanced mapping
-
-            # TODO xw.dot(wx2, out=xw) and alike not working
-            xw[:] = x
-            zw[:] = z
-
-            # STEP 1: Whitening
-            def whitening_transformation(m):
-                u, s, vt = xp.linalg.svd(m, full_matrices=False)
-                return vt.T.dot(xp.diag(1/s)).dot(vt)
-            if args.whiten:
-                wx1 = whitening_transformation(xw[src_indices])
-                wz1 = whitening_transformation(zw[trg_indices])
-                xw = xw.dot(wx1)
-                zw = zw.dot(wz1)
-
-            # STEP 2: Orthogonal mapping
-            wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices]))
-            wz2 = wz2_t.T
-            xw = xw.dot(wx2)
-            zw = zw.dot(wz2)
-
-            # STEP 3: Re-weighting
-            xw *= s**args.src_reweight
-            zw *= s**args.trg_reweight
-
-            # STEP 4: De-whitening
-            if args.src_dewhiten == 'src':
-                xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
-            elif args.src_dewhiten == 'trg':
-                xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
-            if args.trg_dewhiten == 'src':
-                zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2))
-            elif args.trg_dewhiten == 'trg':
-                zw = zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2))
-
-            # STEP 5: Dimensionality reduction
-            if args.dim_reduction > 0:
-                xw = xw[:, :args.dim_reduction]
-                zw = zw[:, :args.dim_reduction]
-
-        # Self-learning
-        if end:
-            break
-        else:
-            # Update the training dictionary
-            if args.direction in ('forward', 'union'):
-                if args.csls_neighborhood > 0:
-                    for i in range(0, trg_size, simbwd.shape[0]):
-                        j = min(i + simbwd.shape[0], trg_size)
-                        zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
-                        knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True)
-                for i in range(0, src_size, simfwd.shape[0]):
-                    j = min(i + simfwd.shape[0], src_size)
-                    xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
-                    simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j])
-                    simfwd[:j-i] -= knn_sim_bwd/2  # Equivalent to the real CSLS scores for NN
-                    dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j])
-            if args.direction in ('backward', 'union'):
-                if args.csls_neighborhood > 0:
-                    for i in range(0, src_size, simfwd.shape[0]):
-                        j = min(i + simfwd.shape[0], src_size)
-                        xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i])
-                        knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True)
-                for i in range(0, trg_size, simbwd.shape[0]):
-                    j = min(i + simbwd.shape[0], trg_size)
-                    zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i])
-                    simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j])
-                    simbwd[:j-i] -= knn_sim_fwd/2  # Equivalent to the real CSLS scores for NN
-                    dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j])
-            if args.direction == 'forward':
-                src_indices = src_indices_forward
-                trg_indices = trg_indices_forward
-            elif args.direction == 'backward':
-                src_indices = src_indices_backward
-                trg_indices = trg_indices_backward
-            elif args.direction == 'union':
-                src_indices = xp.concatenate((src_indices_forward, src_indices_backward))
-                trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward))
-
-            # Objective function evaluation
-            if args.direction == 'forward':
-                objective = xp.mean(best_sim_forward).tolist()
-            elif args.direction == 'backward':
-                objective = xp.mean(best_sim_backward).tolist()
-            elif args.direction == 'union':
-                objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2
-            if objective - best_objective >= args.threshold:
-                last_improvement = it
-                best_objective = objective
-
-            # Accuracy and similarity evaluation in validation
-            if args.validation is not None:
-                src = list(validation.keys())
-                xw[src].dot(zw.T, out=simval)
-                nn = asnumpy(simval.argmax(axis=1))
-                accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))])
-                similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))])
-
-            # Logging
-            duration = time.time() - t
-            if args.verbose:
-                print(file=sys.stderr)
-                print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr)
-                print('\t- Objective:        {0:9.4f}%'.format(100 * objective), file=sys.stderr)
-                print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr)
-                if args.validation is not None:
-                    print('\t- Val. similarity:  {0:9.4f}%'.format(100 * similarity), file=sys.stderr)
-                    print('\t- Val. accuracy:    {0:9.4f}%'.format(100 * accuracy), file=sys.stderr)
-                    print('\t- Val. coverage:    {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr)
-                sys.stderr.flush()
-            if args.log is not None:
-                val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format(
-                    100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else ''
-                print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log)
-                log.flush()
-
-        t = time.time()
-        it += 1
-
-    # Write mapped embeddings
-    srcfile = open(args.src_output, mode='w', encoding=args.encoding, errors='surrogateescape')
-    trgfile = open(args.trg_output, mode='w', encoding=args.encoding, errors='surrogateescape')
-    embeddings.write(src_words, xw, srcfile)
-    embeddings.write(trg_words, zw, trgfile)
-    srcfile.close()
-    trgfile.close()
-
+        vecmap.set_validation_dictionary(args.validation)
+    if args.self_learning:
+        vecmap.self_learning_train(
+            dict_update_batch_size = args.batch_size,
+            objective_threshold = args.threshold,
+            stochastic_interval = args.stochastic_interval,
+            stochastic_initial = args.stochastic_initial,
+        )
+    else:
+        vecmap.train()
+    vecmap.save_embeddings(
+        src_output = args.src_output,
+        trg_output = args.trg_output,
+    )
+
 
 if __name__ == '__main__':
     main()
diff --git a/vecmap.py b/vecmap.py
new file mode 100644
index 0000000..061e809
--- /dev/null
+++ b/vecmap.py
@@ -0,0 +1,444 @@
+import numpy as np
+from cupy_utils import supports_cupy, get_cupy, get_array_module, asnumpy
+import embeddings
+import re
+import time
+import collections
+from sklearn.decomposition import PCA
+
+
+def topk_mean(m, k, inplace=False):  # TODO Assuming that axis is 1
+    xp = get_array_module(m)
+    n = m.shape[0]
+    ans = xp.zeros(n, dtype=m.dtype)
+    if k <= 0:
+        return ans
+    if not inplace:
+        m = xp.array(m)
+    ind0 = xp.arange(n)
+    ind1 = xp.empty(n, dtype=int)
+    minimum = m.min()
+    for i in range(k):
+        m.argmax(axis=1, out=ind1)
+        ans += m[ind0, ind1]
+        m[ind0, ind1] = minimum
+    return ans / k
+
+
+def dropout(m, p):
+    if p <= 0.0:
+        return m
+    else:
+        xp = get_array_module(m)
+        mask = xp.random.rand(*m.shape) >= p
+        return m*mask
+
+
+class VecMap:
+    def __init__(
+        self,
+        training_mode : str = 'advanced',  # 'orthogonal' or 'unconstrained' or 'advanced'
+        whiten : bool = False,
+        src_reweight : float = 0.,
+        trg_reweight : float = 0.,
+        src_dewhiten : str = None,  # 'src' or 'trg'
+        trg_dewhiten : str = None,  # 'src' or 'trg'
+        dim_reduction : int = None,
+        init_dictionary_mode : str = 'unsupervised',  # 'unsupervised', 'numerals', 'identical', 'seed'
+        dictionary_induction_direction : str = 'union',  # 'forward', 'backward', 'union'
+        unsupervised_dictionary_size : int = None,
+        vocabulary_cutoff : int = None,
+        normalization_actions : list = [],  # ['unit', 'center', 'unitdim', 'centeremb', 'none']
+        file_encodings : str = 'utf-8',
+        csls : int = 0,
+        dtype : str = 'float32',
+    ) -> None:
+        VecMap._check_training_mode_validity(training_mode)
+        VecMap._check_dewhiten_mode_validity(src_dewhiten)
+        VecMap._check_dewhiten_mode_validity(trg_dewhiten)
+        VecMap._check_whitening_and_dewhitening_concordancy(whiten, src_dewhiten, trg_dewhiten)
+        VecMap._check_dictionary_induction_direction(dictionary_induction_direction)
+        VecMap._check_init_dictionary_mode(init_dictionary_mode)
+        VecMap._check_normalization_actions(normalization_actions)
+
+        self._training_mode = training_mode
+        self._whiten_is_active = whiten
+        self._src_dewhiten = src_dewhiten
+        self._trg_dewhiten = trg_dewhiten
+        self.x_rw = src_reweight
+        self.z_rw = trg_reweight
+        self.dim_reduction = dim_reduction
+        self._dictionary_induction_direction = dictionary_induction_direction
+        self._init_dictionary_mode = init_dictionary_mode
+        self._unsupervised_dictionary_size = np.inf if unsupervised_dictionary_size is None else unsupervised_dictionary_size
+        self._cutoff = np.inf if vocabulary_cutoff is None else vocabulary_cutoff
+        self.normalization_actions = normalization_actions
+        self.encoding = file_encodings
+        self.csls = csls
+        self.dtype = dtype
+
+        self.dictionary = None
+        self.validation_set = None
+        self.xp = np
+
+
+    @staticmethod
+    def _check_dewhiten_mode_validity(mode):
+        if mode is not None and mode not in ['src', 'trg']:  # None disables de-whitening
+            raise Exception(f"De-whitening mode can be None, 'src', or 'trg', not {mode}")
+    @staticmethod
+    def _check_whitening_and_dewhitening_concordancy(whiten, src_dewhiten, trg_dewhiten):
+        if (src_dewhiten is not None or trg_dewhiten is not None) and not whiten:
+            raise Exception('ERROR: De-whitening requires whitening first')
+    @staticmethod
+    def _check_training_mode_validity(mode):
+        if mode not in ['orthogonal', 'unconstrained', 'advanced']:
+            raise Exception(f"Training mode can be either 'orthogonal', 'unconstrained', or 'advanced', not {mode}")
+    @staticmethod
+    def _check_dictionary_induction_direction(direction):
+        if direction not in ['forward', 'backward', 'union']:
+            raise Exception(f"Dictionary induction direction can be either 'forward', 'backward', or 'union', not {direction}")
+    @staticmethod
+    def _check_init_dictionary_mode(mode):
+        if mode not in ['unsupervised', 'numerals', 'identical', 'seed']:
+            raise Exception(f"Init dictionary mode can be either 'unsupervised', 'numerals', 'identical', or 'seed', not {mode}")
+    @staticmethod
+    def _check_normalization_actions(actions):
+        for action in actions:
+            if action not in ['unit', 'center', 'unitdim', 'centeremb', 'none']:
+                raise Exception("Invalid normalization actions.")
+    @staticmethod
+    def _check_seed_dictionary(mode, path):
+        if mode=='seed' and path is None:
+            raise Exception("Init seed dictionary path is missing. Set the `seed_dictionary` argument.")
+
+
+    def to_cuda(self, device_id=None):
+        if not supports_cupy():
+            raise Exception('ERROR: Install CuPy for CUDA support')
+        self.xp = get_cupy()
+        if device_id is not None:
+            self.xp.cuda.Device(device_id).use()
+
+    def set_seed(self, seed):
+        self.xp.random.seed(seed)
+
+
+    def _whitening_transformation(self, m):
+        u, s, vt = self.xp.linalg.svd(m, full_matrices=False)
+        return vt.T @ self.xp.diag(1/s) @ vt
+
+
+    def _actual_whiten(self, a, indices):
+        w1 = self._whitening_transformation(a[indices])
+        return a @ w1, w1
+
+
+    def _whiten(self, a, indices):
+        if self._whiten_is_active:
+            return self._actual_whiten(a, indices)
+        else:
+            return a, None
+
+
+    def _bidirectional_orthogonal_map(self, x, ix, z, iz):
+        wx, s, wz_t = self.xp.linalg.svd(x[ix].T @ z[iz])
+        wz = wz_t.T
+        size = min(wx.shape[0], wz.shape[0])
+        wx, wz = wx[:, :size], wz[:, :size]
+        xw, zw = x @ wx, z @ wz
+        return xw, zw, wx, wz, s
+
+
+    def _reweight(self, a, s, rw=0):
+        return a * s**rw
+
+
+    def _actual_dewhiten(self, a, w1, w2):
+        return a @ w2.T @ self.xp.linalg.inv(w1) @ w2
+
+
+    def _dewhiten(self, a, wx1, wz1, wx2, wz2, mode):
+        if mode is None:
+            return a
+        if mode=='src':
+            w1, w2 = wx1, wx2
+        if mode=='trg':
+            w1, w2 = wz1, wz2
+        return self._actual_dewhiten(a, w1, w2)
+
+
+    def _dim_reduction(self, a):
+        return a[:, :self.dim_reduction]
+
+
+    def _advanced_map(self, x, ix, z, iz):
+        xw = x.copy()
+        zw = z.copy()
+
+        xw, wx1 = self._whiten(xw, ix)
+        zw, wz1 = self._whiten(zw, iz)
+
+        xw, zw, wx2, wz2, s = self._bidirectional_orthogonal_map(xw, ix, zw, iz)
+
+        xw = self._reweight(xw, s, self.x_rw)
+        zw = self._reweight(zw, s, self.z_rw)
+
+        xw = self._dewhiten(xw, wx1, wz1, wx2, wz2, self._src_dewhiten)
+        zw = self._dewhiten(zw, wx1, wz1, wx2, wz2, self._trg_dewhiten)
+
+        xw = self._dim_reduction(xw)
+        zw = self._dim_reduction(zw)
+
+        return xw, zw
+
+
+    def _orthogonal_map(self, x, ix, z, iz):
+        u, s, v_t = self.xp.linalg.svd(z[iz].T @ x[ix])
+        size = min(u.shape[0], v_t.shape[0])
+        u, v_t = u[:, :size], v_t[:size, :]
+        xw = x @ v_t.T @ u.T
+        zw = z.copy()
+        return xw, zw
+
+
+    def _unconstrained_map(self, x, ix, z, iz):
+        w = self.xp.linalg.inv(x[ix].T @ x[ix]) @ x[ix].T @ z[iz]
+        xw = x @ w
+        zw = z.copy()
+        return xw, zw
+
+
+    def _map(self, x, ix, z, iz, is_last_iteration):
+        if self._training_mode=='advanced':
+            if is_last_iteration:
+                return self._advanced_map(x, ix, z, iz)
+            else:
+                return self._orthogonal_map(x, ix, z, iz)
+        if self._training_mode=='orthogonal':
+            return self._orthogonal_map(x, ix, z, iz)
+        if self._training_mode=='unconstrained':
+            return self._unconstrained_map(x, ix, z, iz)
+
+
+    def _get_indices(self, sim):
+        if self._dictionary_induction_direction=='forward':
+            return self.xp.arange(sim.shape[0]), sim.argmax(axis=1)
+        if self._dictionary_induction_direction=='backward':
+            return sim.argmax(axis=0), self.xp.arange(sim.shape[1])
+        if self._dictionary_induction_direction=='union':
+            return self.xp.concatenate((self.xp.arange(sim.shape[0]), sim.argmax(axis=0))), \
+                   self.xp.concatenate((sim.argmax(axis=1), self.xp.arange(sim.shape[1])))
+
+
+    def _build_unsupervised_seed_dictionary(self):
+        size = min(self.x.shape[0], self.z.shape[0], self._unsupervised_dictionary_size)
+        u, s, vt = self.xp.linalg.svd(self.x[:size], full_matrices=False)
+        xsim = (u*s) @ u.T
+        u, s, vt = self.xp.linalg.svd(self.z[:size], full_matrices=False)
+        zsim = (u*s) @ u.T
+        del u, s, vt
+        xsim.sort(axis=1)
+        zsim.sort(axis=1)
+        embeddings.normalize(xsim, self.normalization_actions)
+        embeddings.normalize(zsim, self.normalization_actions)
+        sim = xsim @ zsim.T
+        if self.csls > 0:
+            knn_sim_fwd = topk_mean(sim, k=self.csls)
+            knn_sim_bwd = topk_mean(sim.T, k=self.csls)
+            sim -= knn_sim_fwd[:, self.xp.newaxis]/2 + knn_sim_bwd/2
+        return self._get_indices(sim)
+
+
+    def _build_identical_seed_dictionary(self, x_words, z_words):
+        identicals = set(x_words).intersection(set(z_words))
+        xi, zi = [], []
+        for word in identicals:
+            xi.append(self.src_word2ind[word])
+            zi.append(self.trg_word2ind[word])
+        return xi, zi
+
+
+    def _build_numerals_seed_dictionary(self):
+        numeral_regex = re.compile('^[0-9]+$')
+        x_numerals = {word for word in self.x_words if numeral_regex.match(word) is not None}
+        z_numerals = {word for word in self.z_words if numeral_regex.match(word) is not None}
+        return self._build_identical_seed_dictionary(x_numerals, z_numerals)
+
+
+    def _read_dictionary(self, path):
+        xi, zi = [], []
+        oov = []
+        with open(path, encoding=self.encoding, errors='surrogateescape') as f:
+            for line in f:
+                x_word, z_word = line.split()
+                try:
+                    x, z = self.src_word2ind[x_word], self.trg_word2ind[z_word]
+                    xi.append(x)
+                    zi.append(z)
+                except KeyError:
+                    oov.append((x_word, z_word))
+        return xi, zi, oov
+
+
+    def _build_seed_dictionary(self, seed_dictionary_path):
+        if self._init_dictionary_mode=='unsupervised':
+            return self._build_unsupervised_seed_dictionary()
+        if self._init_dictionary_mode=='numerals':
+            return self._build_numerals_seed_dictionary()
+        if self._init_dictionary_mode=='identical':
+            return self._build_identical_seed_dictionary(self.x_words, self.z_words)
+        if self._init_dictionary_mode=='seed':
+            xi, zi, _ = self._read_dictionary(seed_dictionary_path)
+            return xi, zi
+
+
+    def _select_indices_for_dictionary(self, x, x_size, z, z_size, keep_prob, batch_size):
+        knn_sim = self.xp.zeros(z_size, dtype=self.dtype)
+        if self.csls > 0:
+            for b in range(0, z_size, batch_size):
+                sim = z[b:min(b+batch_size, z_size)] @ x[:x_size].T  # cap the slice at the vocabulary cutoff
+                knn_sim[b:b+batch_size] = topk_mean(sim, k=self.csls, inplace=True)
+        best_sim = self.xp.full(x_size, -100, dtype=self.dtype)
+        indices = self.xp.zeros(x_size, dtype=int)
+        for b in range(0, x_size, batch_size):
+            sim = x[b:min(b+batch_size, x_size)] @ z[:z_size].T  # cap the slice at the vocabulary cutoff
+            best_sim[b:b+batch_size] = sim.max(axis=1)
+            sim -= knn_sim/2
+            indices[b:b+batch_size] = dropout(sim, 1 - keep_prob).argmax(axis=1)
+        return indices, self.xp.mean(best_sim).tolist()  # .tolist() gives a Python float for both NumPy and CuPy
+
+
+    def _rebuild_dictionary(self, x, z, keep_prob, batch_size):
+        x_size = min(x.shape[0], self._cutoff)
+        z_size = min(z.shape[0], self._cutoff)
+        if self._dictionary_induction_direction=='forward':
+            xi = self.xp.arange(x_size)
+            zi, objective = self._select_indices_for_dictionary(x, x_size, z, z_size, keep_prob, batch_size)
+        if self._dictionary_induction_direction=='backward':
+            xi, objective = self._select_indices_for_dictionary(z, z_size, x, x_size, keep_prob, batch_size)
+            zi = self.xp.arange(z_size)
+        if self._dictionary_induction_direction=='union':
+            xi, objective1 = self._select_indices_for_dictionary(z, z_size, x, x_size, keep_prob, batch_size)
+            zi, objective2 = self._select_indices_for_dictionary(x, x_size, z, z_size, keep_prob, batch_size)
+            objective = (objective1 + objective2) / 2
+            xi = self.xp.concatenate((self.xp.arange(x_size), xi))
+            zi = self.xp.concatenate((zi, self.xp.arange(z_size)))
+        return (xi, zi), objective
+
+
+    def set_validation_dictionary(self, path):
+        xi, zi, _ = self._read_dictionary(path)
+        self.validation_set = collections.defaultdict(set)
+        for x, z in zip(xi, zi):
+            self.validation_set[x].add(z)
+        print(f"Validation set length: {len(self.validation_set)}")
+
+
+    def validate(self, x, z):
+        src = list(self.validation_set.keys())
+        simval = x[src] @ z.T
+        nn = asnumpy(simval.argmax(axis=1))
+        accuracy = np.mean([(nn[i] in self.validation_set[src[i]]) for i in range(len(src))])
+        similarity = np.mean([max([simval[i, j].tolist() for j in self.validation_set[src[i]]]) for i in range(len(src))])
+        return accuracy, similarity
+
+
+    def _log(self, itr, duration, objective, keep_prob, similarity=None, accuracy=None):
+        print(f'ITERATION {itr} - DURATION {duration:.2f}s:')
+        print(f'\t- Objective: {objective:.6f}')
+        print(f'\t- Drop probability: {1-keep_prob:.6f}')
+        if accuracy is not None:
+            print(f'\t- Val. similarity: {similarity:.6f}')
+            print(f'\t- Val. accuracy: {accuracy:.6f}')
+
+
+    def set_train_data(
+        self,
+        src_input : str,
+        trg_input : str,
+        seed_dictionary : str = None,
+        pca : bool = False,
+        n_components : int = None,
+    ) -> None:
+        VecMap._check_seed_dictionary(self._init_dictionary_mode, seed_dictionary)
+        with open(src_input, encoding=self.encoding, errors='surrogateescape') as srcfile:
+            self.x_words, x = embeddings.read(srcfile, dtype=self.dtype)
+        with open(trg_input, encoding=self.encoding, errors='surrogateescape') as trgfile:
+            self.z_words, z = embeddings.read(trgfile, dtype=self.dtype)
+        if pca:
+            if n_components is None:
+                n_components = min(x.shape[1], z.shape[1])
+            x = PCA(n_components=n_components, svd_solver='full').fit_transform(x).astype(self.dtype)
+            z = PCA(n_components=n_components, svd_solver='full').fit_transform(z).astype(self.dtype)
+        self.x = self.xp.asarray(x)
+        self.z = self.xp.asarray(z)
+        embeddings.normalize(self.x, self.normalization_actions)
+        embeddings.normalize(self.z, self.normalization_actions)
+        self.src_word2ind = {word: i for i, word in enumerate(self.x_words)}
+        self.trg_word2ind = {word: i for i, word in enumerate(self.z_words)}
+        self.ix, self.iz = self._build_seed_dictionary(seed_dictionary)
+
+
+    def train(
+        self,
+    ) -> None:
+        self.xw, self.zw = self._map(self.x, self.ix, self.z, self.iz, True)
+
+
+    def self_learning_train(
+        self,
+        dict_update_batch_size : int = 10000,
+        stochastic_initial : float = 0.1,
+        objective_threshold : float = 0.000001,
+        stochastic_interval : int = 50,
+        stochastic_multiplier : float = 2.,
+        log : bool = True,
+    ) -> None:
+        xw, zw = self.x.copy(), self.z.copy()
+
+        finish = False
+        best_objective = objective = -100
+        last_improvement = -1
+        t0 = None
+        keep_prob = stochastic_initial
+
+        itr = 0
+        while not finish:
+            if itr:
+                (self.ix, self.iz), objective = self._rebuild_dictionary(xw, zw, keep_prob, dict_update_batch_size)
+
+            if log:
+                accuracy, similarity = None, None
+                if self.validation_set:
+                    accuracy, similarity = self.validate(xw, zw)
+
+                duration = time.time() - t0 if t0 is not None else 0
+                self._log(itr, duration, objective, keep_prob, similarity, accuracy)
+
+            if objective - best_objective >= objective_threshold:
+                last_improvement = itr
+                best_objective = objective
+            if itr - last_improvement > stochastic_interval:
+                finish = keep_prob >= 1.0  # stop only once the interval expires at full keep probability
+                keep_prob = min(1.0, stochastic_multiplier*keep_prob)
+                last_improvement = itr
+
+            t0 = time.time()
+            xw, zw = self._map(self.x, self.ix, self.z, self.iz, finish)
+
+            itr += 1
+
+        self.xw, self.zw = xw, zw
+
+
+    def save_embeddings(
+        self,
+        src_output : str,
+        trg_output : str,
+    ) -> None:
+        for words, emb, file_name in [[self.x_words, self.xw, src_output],
+                                      [self.z_words, self.zw, trg_output]]:
+            with open(file_name, mode='w', encoding=self.encoding, errors='surrogateescape') as f:
+                embeddings.write(words, emb, f)
+
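
---

Usage note (supplementary, not part of the patch): a minimal sketch of the refactored API, mirroring the wiring that map_embeddings.py now performs. The embedding file paths are placeholders; the class, methods, and keyword arguments are the ones introduced in vecmap.py above.

    from vecmap import VecMap

    vecmap = VecMap(
        training_mode='advanced',             # 'orthogonal' | 'unconstrained' | 'advanced'
        init_dictionary_mode='unsupervised',  # no seed dictionary file required
        whiten=True,                          # must be enabled before de-whitening
        src_dewhiten='src',
        trg_dewhiten='trg',
        csls=10,
        normalization_actions=['unit', 'center'],
    )
    # vecmap.to_cuda(device_id=0)            # optional; requires CuPy
    vecmap.set_seed(0)
    vecmap.set_train_data(src_input='src.emb.txt', trg_input='trg.emb.txt')
    vecmap.self_learning_train(dict_update_batch_size=10000)
    vecmap.save_embeddings(src_output='src.mapped.txt', trg_output='trg.mapped.txt')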
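CLI note (also supplementary): --device_id only takes effect together with --cuda; when it is omitted, CuPy's current device (normally 0) is used. A hypothetical invocation pinning the job to the second GPU, with placeholder paths:

    python3 map_embeddings.py --init_unsupervised --self_learning \
        --cuda --device_id 1 --precision float32 \
        src.emb.txt trg.emb.txt src.mapped.txt trg.mapped.txt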